| @@ -0,0 +1,310 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
// Evaluates the four cubic-convolution weights (kernel parameter A = -0.75)
// for a fractional sample position.
//
//   fx      fractional offset from the base sample
//   coeffs  [out] four weights, in order, for the taps at base - 1, base,
//           base + 1 and base + 2
//
// The first three weights are evaluated from the kernel polynomials; the last
// one is derived so that the four weights sum to 1.
static inline void interpolate_cubic(float fx, float* coeffs)
{
    const float A = -0.75f;

    // distances from the sample position to the first three taps
    const float d_prev = fx + 1;
    const float d_base = fx;
    const float d_next = 1 - fx;

    // outer lobe of the kernel (distance in [1, 2))
    coeffs[0] = A * d_prev * d_prev * d_prev - 5 * A * d_prev * d_prev + 8 * A * d_prev - 4 * A;

    // inner lobe of the kernel (distance in [0, 1])
    coeffs[1] = (A + 2) * d_base * d_base * d_base - (A + 3) * d_base * d_base + 1;
    coeffs[2] = (A + 2) * d_next * d_next * d_next - (A + 3) * d_next * d_next + 1;

    // remaining weight, so the taps form a partition of unity
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}
| static void cubic_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner) | |||
| { | |||
| double scale = (double)w / outw; | |||
| if (align_corner) | |||
| { | |||
| scale = (double)(w - 1) / (outw - 1); | |||
| } | |||
| for (int dx = 0; dx < outw; dx++) | |||
| { | |||
| float fx = (float)((dx + 0.5) * scale - 0.5); | |||
| if (align_corner) | |||
| { | |||
| fx = (float)(dx * scale); | |||
| } | |||
| int sx = static_cast<int>(floor(fx)); | |||
| fx -= sx; | |||
| interpolate_cubic(fx, alpha + dx * 4); | |||
| if (sx <= -1) | |||
| { | |||
| sx = 1; | |||
| alpha[dx * 4 + 0] = 1.f - alpha[dx * 4 + 3]; | |||
| alpha[dx * 4 + 1] = alpha[dx * 4 + 3]; | |||
| alpha[dx * 4 + 2] = 0.f; | |||
| alpha[dx * 4 + 3] = 0.f; | |||
| } | |||
| if (sx == 0) | |||
| { | |||
| sx = 1; | |||
| alpha[dx * 4 + 0] = alpha[dx * 4 + 0] + alpha[dx * 4 + 1]; | |||
| alpha[dx * 4 + 1] = alpha[dx * 4 + 2]; | |||
| alpha[dx * 4 + 2] = alpha[dx * 4 + 3]; | |||
| alpha[dx * 4 + 3] = 0.f; | |||
| } | |||
| if (sx == w - 2) | |||
| { | |||
| sx = w - 3; | |||
| alpha[dx * 4 + 3] = alpha[dx * 4 + 2] + alpha[dx * 4 + 3]; | |||
| alpha[dx * 4 + 2] = alpha[dx * 4 + 1]; | |||
| alpha[dx * 4 + 1] = alpha[dx * 4 + 0]; | |||
| alpha[dx * 4 + 0] = 0.f; | |||
| } | |||
| if (sx >= w - 1) | |||
| { | |||
| sx = w - 3; | |||
| alpha[dx * 4 + 3] = 1.f - alpha[dx * 4 + 0]; | |||
| alpha[dx * 4 + 2] = alpha[dx * 4 + 0]; | |||
| alpha[dx * 4 + 1] = 0.f; | |||
| alpha[dx * 4 + 0] = 0.f; | |||
| } | |||
| xofs[dx] = sx; | |||
| } | |||
| } | |||
| static void resize_bicubic_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) | |||
| { | |||
| int w = dst.w; | |||
| int h = dst.h; | |||
| // loop body | |||
| Mat rowsbuf0(w); | |||
| Mat rowsbuf1(w); | |||
| Mat rowsbuf2(w); | |||
| Mat rowsbuf3(w); | |||
| float* rows0 = rowsbuf0; | |||
| float* rows1 = rowsbuf1; | |||
| float* rows2 = rowsbuf2; | |||
| float* rows3 = rowsbuf3; | |||
| int prev_sy1 = -3; | |||
| for (int dy = 0; dy < h; dy++) | |||
| { | |||
| int sy = yofs[dy]; | |||
| if (sy == prev_sy1) | |||
| { | |||
| // reuse all rows | |||
| } | |||
| else if (sy == prev_sy1 + 1) | |||
| { | |||
| // hresize one row | |||
| float* rows0_old = rows0; | |||
| rows0 = rows1; | |||
| rows1 = rows2; | |||
| rows2 = rows3; | |||
| rows3 = rows0_old; | |||
| const float* S3 = src.row(sy + 2); | |||
| const float* alphap = alpha; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const float* S3p = S3 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| float a2 = alphap[2]; | |||
| float a3 = alphap[3]; | |||
| rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else if (sy == prev_sy1 + 2) | |||
| { | |||
| // hresize two rows | |||
| float* rows0_old = rows0; | |||
| float* rows1_old = rows1; | |||
| rows0 = rows2; | |||
| rows1 = rows3; | |||
| rows2 = rows0_old; | |||
| rows3 = rows1_old; | |||
| const float* S2 = src.row(sy + 1); | |||
| const float* S3 = src.row(sy + 2); | |||
| const float* alphap = alpha; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const float* S2p = S2 + sx; | |||
| const float* S3p = S3 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| float a2 = alphap[2]; | |||
| float a3 = alphap[3]; | |||
| rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3; | |||
| rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else if (sy == prev_sy1 + 3) | |||
| { | |||
| // hresize three rows | |||
| float* rows0_old = rows0; | |||
| float* rows1_old = rows1; | |||
| float* rows2_old = rows2; | |||
| rows0 = rows3; | |||
| rows1 = rows0_old; | |||
| rows2 = rows1_old; | |||
| rows3 = rows2_old; | |||
| const float* S1 = src.row(sy); | |||
| const float* S2 = src.row(sy + 1); | |||
| const float* S3 = src.row(sy + 2); | |||
| const float* alphap = alpha; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const float* S1p = S1 + sx; | |||
| const float* S2p = S2 + sx; | |||
| const float* S3p = S3 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| float a2 = alphap[2]; | |||
| float a3 = alphap[3]; | |||
| rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3; | |||
| rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3; | |||
| rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // hresize four rows | |||
| const float* S0 = src.row(sy - 1); | |||
| const float* S1 = src.row(sy); | |||
| const float* S2 = src.row(sy + 1); | |||
| const float* S3 = src.row(sy + 2); | |||
| const float* alphap = alpha; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const float* S0p = S0 + sx; | |||
| const float* S1p = S1 + sx; | |||
| const float* S2p = S2 + sx; | |||
| const float* S3p = S3 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| float a2 = alphap[2]; | |||
| float a3 = alphap[3]; | |||
| rows0p[dx] = S0p[-1] * a0 + S0p[0] * a1 + S0p[1] * a2 + S0p[2] * a3; | |||
| rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3; | |||
| rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3; | |||
| rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; | |||
| alphap += 4; | |||
| } | |||
| } | |||
| prev_sy1 = sy; | |||
| // vresize | |||
| float b0 = beta[0]; | |||
| float b1 = beta[1]; | |||
| float b2 = beta[2]; | |||
| float b3 = beta[3]; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| float* Dp = dst.row(dy); | |||
| int dx = 0; | |||
| #if __SSE2__ | |||
| #if __AVX__ | |||
| __m256 _b0_256 = _mm256_set1_ps(b0); | |||
| __m256 _b1_256 = _mm256_set1_ps(b1); | |||
| __m256 _b2_256 = _mm256_set1_ps(b2); | |||
| __m256 _b3_256 = _mm256_set1_ps(b3); | |||
| for (; dx + 7 < w; dx += 8) | |||
| { | |||
| __m256 _rows0 = _mm256_loadu_ps(rows0p); | |||
| __m256 _rows1 = _mm256_loadu_ps(rows1p); | |||
| __m256 _rows2 = _mm256_loadu_ps(rows2p); | |||
| __m256 _rows3 = _mm256_loadu_ps(rows3p); | |||
| __m256 _D = _mm256_mul_ps(_rows0, _b0_256); | |||
| _D = _mm256_comp_fmadd_ps(_rows1, _b1_256, _D); | |||
| _D = _mm256_comp_fmadd_ps(_rows2, _b2_256, _D); | |||
| _D = _mm256_comp_fmadd_ps(_rows3, _b3_256, _D); | |||
| _mm256_storeu_ps(Dp, _D); | |||
| Dp += 8; | |||
| rows0p += 8; | |||
| rows1p += 8; | |||
| rows2p += 8; | |||
| rows3p += 8; | |||
| } | |||
| #endif // __AVX__ | |||
| __m128 _b0_128 = _mm_set1_ps(b0); | |||
| __m128 _b1_128 = _mm_set1_ps(b1); | |||
| __m128 _b2_128 = _mm_set1_ps(b2); | |||
| __m128 _b3_128 = _mm_set1_ps(b3); | |||
| for (; dx + 3 < w; dx += 4) | |||
| { | |||
| __m128 _rows0 = _mm_loadu_ps(rows0p); | |||
| __m128 _rows1 = _mm_loadu_ps(rows1p); | |||
| __m128 _rows2 = _mm_loadu_ps(rows2p); | |||
| __m128 _rows3 = _mm_loadu_ps(rows3p); | |||
| __m128 _D = _mm_mul_ps(_rows0, _b0_128); | |||
| _D = _mm_comp_fmadd_ps(_rows1, _b1_128, _D); | |||
| _D = _mm_comp_fmadd_ps(_rows2, _b2_128, _D); | |||
| _D = _mm_comp_fmadd_ps(_rows3, _b3_128, _D); | |||
| _mm_storeu_ps(Dp, _D); | |||
| Dp += 4; | |||
| rows0p += 4; | |||
| rows1p += 4; | |||
| rows2p += 4; | |||
| rows3p += 4; | |||
| } | |||
| #endif // __SSE2__ | |||
| for (; dx < w; dx++) | |||
| { | |||
| *Dp++ = *rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3; | |||
| } | |||
| beta += 4; | |||
| } | |||
| } | |||
| @@ -0,0 +1,286 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void resize_bicubic_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) | |||
| { | |||
| int w = dst.w; | |||
| int h = dst.h; | |||
| // loop body | |||
| Mat rowsbuf0(w, (size_t)4 * 4u, 4); | |||
| Mat rowsbuf1(w, (size_t)4 * 4u, 4); | |||
| Mat rowsbuf2(w, (size_t)4 * 4u, 4); | |||
| Mat rowsbuf3(w, (size_t)4 * 4u, 4); | |||
| float* rows0 = rowsbuf0; | |||
| float* rows1 = rowsbuf1; | |||
| float* rows2 = rowsbuf2; | |||
| float* rows3 = rowsbuf3; | |||
| int prev_sy1 = -3; | |||
| for (int dy = 0; dy < h; dy++) | |||
| { | |||
| int sy = yofs[dy]; | |||
| if (sy == prev_sy1) | |||
| { | |||
| // reuse all rows | |||
| } | |||
| else if (sy == prev_sy1 + 1) | |||
| { | |||
| // hresize one row | |||
| float* rows0_old = rows0; | |||
| rows0 = rows1; | |||
| rows1 = rows2; | |||
| rows2 = rows3; | |||
| rows3 = rows0_old; | |||
| const float* S3 = src.row(sy + 2); | |||
| const float* alphap = alpha; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const float* S3p = S3 + sx; | |||
| __m128 _a0 = _mm_set1_ps(alphap[0]); | |||
| __m128 _a1 = _mm_set1_ps(alphap[1]); | |||
| __m128 _a2 = _mm_set1_ps(alphap[2]); | |||
| __m128 _a3 = _mm_set1_ps(alphap[3]); | |||
| __m128 _S30 = _mm_load_ps(S3p - 4); | |||
| __m128 _S31 = _mm_load_ps(S3p + 0); | |||
| __m128 _S32 = _mm_load_ps(S3p + 4); | |||
| __m128 _S33 = _mm_load_ps(S3p + 8); | |||
| __m128 _rows3 = _mm_mul_ps(_S30, _a0); | |||
| _rows3 = _mm_comp_fmadd_ps(_S31, _a1, _rows3); | |||
| _rows3 = _mm_comp_fmadd_ps(_S32, _a2, _rows3); | |||
| _rows3 = _mm_comp_fmadd_ps(_S33, _a3, _rows3); | |||
| _mm_store_ps(rows3p + dx * 4, _rows3); | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else if (sy == prev_sy1 + 2) | |||
| { | |||
| // hresize two rows | |||
| float* rows0_old = rows0; | |||
| float* rows1_old = rows1; | |||
| rows0 = rows2; | |||
| rows1 = rows3; | |||
| rows2 = rows0_old; | |||
| rows3 = rows1_old; | |||
| const float* S2 = src.row(sy + 1); | |||
| const float* S3 = src.row(sy + 2); | |||
| const float* alphap = alpha; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const float* S2p = S2 + sx; | |||
| const float* S3p = S3 + sx; | |||
| __m128 _a0 = _mm_set1_ps(alphap[0]); | |||
| __m128 _a1 = _mm_set1_ps(alphap[1]); | |||
| __m128 _a2 = _mm_set1_ps(alphap[2]); | |||
| __m128 _a3 = _mm_set1_ps(alphap[3]); | |||
| __m128 _S20 = _mm_load_ps(S2p - 4); | |||
| __m128 _S21 = _mm_load_ps(S2p + 0); | |||
| __m128 _S22 = _mm_load_ps(S2p + 4); | |||
| __m128 _S23 = _mm_load_ps(S2p + 8); | |||
| __m128 _S30 = _mm_load_ps(S3p - 4); | |||
| __m128 _S31 = _mm_load_ps(S3p + 0); | |||
| __m128 _S32 = _mm_load_ps(S3p + 4); | |||
| __m128 _S33 = _mm_load_ps(S3p + 8); | |||
| __m128 _rows2 = _mm_mul_ps(_S20, _a0); | |||
| __m128 _rows3 = _mm_mul_ps(_S30, _a0); | |||
| _rows2 = _mm_comp_fmadd_ps(_S21, _a1, _rows2); | |||
| _rows3 = _mm_comp_fmadd_ps(_S31, _a1, _rows3); | |||
| _rows2 = _mm_comp_fmadd_ps(_S22, _a2, _rows2); | |||
| _rows3 = _mm_comp_fmadd_ps(_S32, _a2, _rows3); | |||
| _rows2 = _mm_comp_fmadd_ps(_S23, _a3, _rows2); | |||
| _rows3 = _mm_comp_fmadd_ps(_S33, _a3, _rows3); | |||
| _mm_store_ps(rows2p + dx * 4, _rows2); | |||
| _mm_store_ps(rows3p + dx * 4, _rows3); | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else if (sy == prev_sy1 + 3) | |||
| { | |||
| // hresize three rows | |||
| float* rows0_old = rows0; | |||
| float* rows1_old = rows1; | |||
| float* rows2_old = rows2; | |||
| rows0 = rows3; | |||
| rows1 = rows0_old; | |||
| rows2 = rows1_old; | |||
| rows3 = rows2_old; | |||
| const float* S1 = src.row(sy); | |||
| const float* S2 = src.row(sy + 1); | |||
| const float* S3 = src.row(sy + 2); | |||
| const float* alphap = alpha; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const float* S1p = S1 + sx; | |||
| const float* S2p = S2 + sx; | |||
| const float* S3p = S3 + sx; | |||
| __m128 _a0 = _mm_set1_ps(alphap[0]); | |||
| __m128 _a1 = _mm_set1_ps(alphap[1]); | |||
| __m128 _a2 = _mm_set1_ps(alphap[2]); | |||
| __m128 _a3 = _mm_set1_ps(alphap[3]); | |||
| __m128 _S10 = _mm_load_ps(S1p - 4); | |||
| __m128 _S11 = _mm_load_ps(S1p + 0); | |||
| __m128 _S12 = _mm_load_ps(S1p + 4); | |||
| __m128 _S13 = _mm_load_ps(S1p + 8); | |||
| __m128 _S20 = _mm_load_ps(S2p - 4); | |||
| __m128 _S21 = _mm_load_ps(S2p + 0); | |||
| __m128 _S22 = _mm_load_ps(S2p + 4); | |||
| __m128 _S23 = _mm_load_ps(S2p + 8); | |||
| __m128 _S30 = _mm_load_ps(S3p - 4); | |||
| __m128 _S31 = _mm_load_ps(S3p + 0); | |||
| __m128 _S32 = _mm_load_ps(S3p + 4); | |||
| __m128 _S33 = _mm_load_ps(S3p + 8); | |||
| __m128 _rows1 = _mm_mul_ps(_S10, _a0); | |||
| __m128 _rows2 = _mm_mul_ps(_S20, _a0); | |||
| __m128 _rows3 = _mm_mul_ps(_S30, _a0); | |||
| _rows1 = _mm_comp_fmadd_ps(_S11, _a1, _rows1); | |||
| _rows2 = _mm_comp_fmadd_ps(_S21, _a1, _rows2); | |||
| _rows3 = _mm_comp_fmadd_ps(_S31, _a1, _rows3); | |||
| _rows1 = _mm_comp_fmadd_ps(_S12, _a2, _rows1); | |||
| _rows2 = _mm_comp_fmadd_ps(_S22, _a2, _rows2); | |||
| _rows3 = _mm_comp_fmadd_ps(_S32, _a2, _rows3); | |||
| _rows1 = _mm_comp_fmadd_ps(_S13, _a3, _rows1); | |||
| _rows2 = _mm_comp_fmadd_ps(_S23, _a3, _rows2); | |||
| _rows3 = _mm_comp_fmadd_ps(_S33, _a3, _rows3); | |||
| _mm_store_ps(rows1p + dx * 4, _rows1); | |||
| _mm_store_ps(rows2p + dx * 4, _rows2); | |||
| _mm_store_ps(rows3p + dx * 4, _rows3); | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // hresize four rows | |||
| const float* S0 = src.row(sy - 1); | |||
| const float* S1 = src.row(sy); | |||
| const float* S2 = src.row(sy + 1); | |||
| const float* S3 = src.row(sy + 2); | |||
| const float* alphap = alpha; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const float* S0p = S0 + sx; | |||
| const float* S1p = S1 + sx; | |||
| const float* S2p = S2 + sx; | |||
| const float* S3p = S3 + sx; | |||
| __m128 _a0 = _mm_set1_ps(alphap[0]); | |||
| __m128 _a1 = _mm_set1_ps(alphap[1]); | |||
| __m128 _a2 = _mm_set1_ps(alphap[2]); | |||
| __m128 _a3 = _mm_set1_ps(alphap[3]); | |||
| __m128 _S00 = _mm_load_ps(S0p - 4); | |||
| __m128 _S01 = _mm_load_ps(S0p + 0); | |||
| __m128 _S02 = _mm_load_ps(S0p + 4); | |||
| __m128 _S03 = _mm_load_ps(S0p + 8); | |||
| __m128 _S10 = _mm_load_ps(S1p - 4); | |||
| __m128 _S11 = _mm_load_ps(S1p + 0); | |||
| __m128 _S12 = _mm_load_ps(S1p + 4); | |||
| __m128 _S13 = _mm_load_ps(S1p + 8); | |||
| __m128 _S20 = _mm_load_ps(S2p - 4); | |||
| __m128 _S21 = _mm_load_ps(S2p + 0); | |||
| __m128 _S22 = _mm_load_ps(S2p + 4); | |||
| __m128 _S23 = _mm_load_ps(S2p + 8); | |||
| __m128 _S30 = _mm_load_ps(S3p - 4); | |||
| __m128 _S31 = _mm_load_ps(S3p + 0); | |||
| __m128 _S32 = _mm_load_ps(S3p + 4); | |||
| __m128 _S33 = _mm_load_ps(S3p + 8); | |||
| __m128 _rows0 = _mm_mul_ps(_S00, _a0); | |||
| __m128 _rows1 = _mm_mul_ps(_S10, _a0); | |||
| __m128 _rows2 = _mm_mul_ps(_S20, _a0); | |||
| __m128 _rows3 = _mm_mul_ps(_S30, _a0); | |||
| _rows0 = _mm_comp_fmadd_ps(_S01, _a1, _rows0); | |||
| _rows1 = _mm_comp_fmadd_ps(_S11, _a1, _rows1); | |||
| _rows2 = _mm_comp_fmadd_ps(_S21, _a1, _rows2); | |||
| _rows3 = _mm_comp_fmadd_ps(_S31, _a1, _rows3); | |||
| _rows0 = _mm_comp_fmadd_ps(_S02, _a2, _rows0); | |||
| _rows1 = _mm_comp_fmadd_ps(_S12, _a2, _rows1); | |||
| _rows2 = _mm_comp_fmadd_ps(_S22, _a2, _rows2); | |||
| _rows3 = _mm_comp_fmadd_ps(_S32, _a2, _rows3); | |||
| _rows0 = _mm_comp_fmadd_ps(_S03, _a3, _rows0); | |||
| _rows1 = _mm_comp_fmadd_ps(_S13, _a3, _rows1); | |||
| _rows2 = _mm_comp_fmadd_ps(_S23, _a3, _rows2); | |||
| _rows3 = _mm_comp_fmadd_ps(_S33, _a3, _rows3); | |||
| _mm_store_ps(rows0p + dx * 4, _rows0); | |||
| _mm_store_ps(rows1p + dx * 4, _rows1); | |||
| _mm_store_ps(rows2p + dx * 4, _rows2); | |||
| _mm_store_ps(rows3p + dx * 4, _rows3); | |||
| alphap += 4; | |||
| } | |||
| } | |||
| prev_sy1 = sy; | |||
| // vresize | |||
| __m128 _b0 = _mm_set1_ps(beta[0]); | |||
| __m128 _b1 = _mm_set1_ps(beta[1]); | |||
| __m128 _b2 = _mm_set1_ps(beta[2]); | |||
| __m128 _b3 = _mm_set1_ps(beta[3]); | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| float* Dp = dst.row(dy); | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| __m128 _rows0 = _mm_load_ps(rows0p); | |||
| __m128 _rows1 = _mm_load_ps(rows1p); | |||
| __m128 _rows2 = _mm_load_ps(rows2p); | |||
| __m128 _rows3 = _mm_load_ps(rows3p); | |||
| __m128 _D = _mm_mul_ps(_rows0, _b0); | |||
| _D = _mm_comp_fmadd_ps(_rows1, _b1, _D); | |||
| _D = _mm_comp_fmadd_ps(_rows2, _b2, _D); | |||
| _D = _mm_comp_fmadd_ps(_rows3, _b3, _D); | |||
| _mm_store_ps(Dp, _D); | |||
| Dp += 4; | |||
| rows0p += 4; | |||
| rows1p += 4; | |||
| rows2p += 4; | |||
| rows3p += 4; | |||
| } | |||
| beta += 4; | |||
| } | |||
| } | |||
| @@ -0,0 +1,286 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void resize_bicubic_image_pack8(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) | |||
| { | |||
| int w = dst.w; | |||
| int h = dst.h; | |||
| // loop body | |||
| Mat rowsbuf0(w, (size_t)8 * 4u, 8); | |||
| Mat rowsbuf1(w, (size_t)8 * 4u, 8); | |||
| Mat rowsbuf2(w, (size_t)8 * 4u, 8); | |||
| Mat rowsbuf3(w, (size_t)8 * 4u, 8); | |||
| float* rows0 = rowsbuf0; | |||
| float* rows1 = rowsbuf1; | |||
| float* rows2 = rowsbuf2; | |||
| float* rows3 = rowsbuf3; | |||
| int prev_sy1 = -3; | |||
| for (int dy = 0; dy < h; dy++) | |||
| { | |||
| int sy = yofs[dy]; | |||
| if (sy == prev_sy1) | |||
| { | |||
| // reuse all rows | |||
| } | |||
| else if (sy == prev_sy1 + 1) | |||
| { | |||
| // hresize one row | |||
| float* rows0_old = rows0; | |||
| rows0 = rows1; | |||
| rows1 = rows2; | |||
| rows2 = rows3; | |||
| rows3 = rows0_old; | |||
| const float* S3 = src.row(sy + 2); | |||
| const float* alphap = alpha; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 8; | |||
| const float* S3p = S3 + sx; | |||
| __m256 _a0 = _mm256_set1_ps(alphap[0]); | |||
| __m256 _a1 = _mm256_set1_ps(alphap[1]); | |||
| __m256 _a2 = _mm256_set1_ps(alphap[2]); | |||
| __m256 _a3 = _mm256_set1_ps(alphap[3]); | |||
| __m256 _S30 = _mm256_load_ps(S3p - 8); | |||
| __m256 _S31 = _mm256_load_ps(S3p + 0); | |||
| __m256 _S32 = _mm256_load_ps(S3p + 8); | |||
| __m256 _S33 = _mm256_load_ps(S3p + 16); | |||
| __m256 _rows3 = _mm256_mul_ps(_S30, _a0); | |||
| _rows3 = _mm256_comp_fmadd_ps(_S31, _a1, _rows3); | |||
| _rows3 = _mm256_comp_fmadd_ps(_S32, _a2, _rows3); | |||
| _rows3 = _mm256_comp_fmadd_ps(_S33, _a3, _rows3); | |||
| _mm256_store_ps(rows3p + dx * 8, _rows3); | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else if (sy == prev_sy1 + 2) | |||
| { | |||
| // hresize two rows | |||
| float* rows0_old = rows0; | |||
| float* rows1_old = rows1; | |||
| rows0 = rows2; | |||
| rows1 = rows3; | |||
| rows2 = rows0_old; | |||
| rows3 = rows1_old; | |||
| const float* S2 = src.row(sy + 1); | |||
| const float* S3 = src.row(sy + 2); | |||
| const float* alphap = alpha; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 8; | |||
| const float* S2p = S2 + sx; | |||
| const float* S3p = S3 + sx; | |||
| __m256 _a0 = _mm256_set1_ps(alphap[0]); | |||
| __m256 _a1 = _mm256_set1_ps(alphap[1]); | |||
| __m256 _a2 = _mm256_set1_ps(alphap[2]); | |||
| __m256 _a3 = _mm256_set1_ps(alphap[3]); | |||
| __m256 _S20 = _mm256_load_ps(S2p - 8); | |||
| __m256 _S21 = _mm256_load_ps(S2p + 0); | |||
| __m256 _S22 = _mm256_load_ps(S2p + 8); | |||
| __m256 _S23 = _mm256_load_ps(S2p + 16); | |||
| __m256 _S30 = _mm256_load_ps(S3p - 8); | |||
| __m256 _S31 = _mm256_load_ps(S3p + 0); | |||
| __m256 _S32 = _mm256_load_ps(S3p + 8); | |||
| __m256 _S33 = _mm256_load_ps(S3p + 16); | |||
| __m256 _rows2 = _mm256_mul_ps(_S20, _a0); | |||
| __m256 _rows3 = _mm256_mul_ps(_S30, _a0); | |||
| _rows2 = _mm256_comp_fmadd_ps(_S21, _a1, _rows2); | |||
| _rows3 = _mm256_comp_fmadd_ps(_S31, _a1, _rows3); | |||
| _rows2 = _mm256_comp_fmadd_ps(_S22, _a2, _rows2); | |||
| _rows3 = _mm256_comp_fmadd_ps(_S32, _a2, _rows3); | |||
| _rows2 = _mm256_comp_fmadd_ps(_S23, _a3, _rows2); | |||
| _rows3 = _mm256_comp_fmadd_ps(_S33, _a3, _rows3); | |||
| _mm256_store_ps(rows2p + dx * 8, _rows2); | |||
| _mm256_store_ps(rows3p + dx * 8, _rows3); | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else if (sy == prev_sy1 + 3) | |||
| { | |||
| // hresize three rows | |||
| float* rows0_old = rows0; | |||
| float* rows1_old = rows1; | |||
| float* rows2_old = rows2; | |||
| rows0 = rows3; | |||
| rows1 = rows0_old; | |||
| rows2 = rows1_old; | |||
| rows3 = rows2_old; | |||
| const float* S1 = src.row(sy); | |||
| const float* S2 = src.row(sy + 1); | |||
| const float* S3 = src.row(sy + 2); | |||
| const float* alphap = alpha; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 8; | |||
| const float* S1p = S1 + sx; | |||
| const float* S2p = S2 + sx; | |||
| const float* S3p = S3 + sx; | |||
| __m256 _a0 = _mm256_set1_ps(alphap[0]); | |||
| __m256 _a1 = _mm256_set1_ps(alphap[1]); | |||
| __m256 _a2 = _mm256_set1_ps(alphap[2]); | |||
| __m256 _a3 = _mm256_set1_ps(alphap[3]); | |||
| __m256 _S10 = _mm256_load_ps(S1p - 8); | |||
| __m256 _S11 = _mm256_load_ps(S1p + 0); | |||
| __m256 _S12 = _mm256_load_ps(S1p + 8); | |||
| __m256 _S13 = _mm256_load_ps(S1p + 16); | |||
| __m256 _S20 = _mm256_load_ps(S2p - 8); | |||
| __m256 _S21 = _mm256_load_ps(S2p + 0); | |||
| __m256 _S22 = _mm256_load_ps(S2p + 8); | |||
| __m256 _S23 = _mm256_load_ps(S2p + 16); | |||
| __m256 _S30 = _mm256_load_ps(S3p - 8); | |||
| __m256 _S31 = _mm256_load_ps(S3p + 0); | |||
| __m256 _S32 = _mm256_load_ps(S3p + 8); | |||
| __m256 _S33 = _mm256_load_ps(S3p + 16); | |||
| __m256 _rows1 = _mm256_mul_ps(_S10, _a0); | |||
| __m256 _rows2 = _mm256_mul_ps(_S20, _a0); | |||
| __m256 _rows3 = _mm256_mul_ps(_S30, _a0); | |||
| _rows1 = _mm256_comp_fmadd_ps(_S11, _a1, _rows1); | |||
| _rows2 = _mm256_comp_fmadd_ps(_S21, _a1, _rows2); | |||
| _rows3 = _mm256_comp_fmadd_ps(_S31, _a1, _rows3); | |||
| _rows1 = _mm256_comp_fmadd_ps(_S12, _a2, _rows1); | |||
| _rows2 = _mm256_comp_fmadd_ps(_S22, _a2, _rows2); | |||
| _rows3 = _mm256_comp_fmadd_ps(_S32, _a2, _rows3); | |||
| _rows1 = _mm256_comp_fmadd_ps(_S13, _a3, _rows1); | |||
| _rows2 = _mm256_comp_fmadd_ps(_S23, _a3, _rows2); | |||
| _rows3 = _mm256_comp_fmadd_ps(_S33, _a3, _rows3); | |||
| _mm256_store_ps(rows1p + dx * 8, _rows1); | |||
| _mm256_store_ps(rows2p + dx * 8, _rows2); | |||
| _mm256_store_ps(rows3p + dx * 8, _rows3); | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // hresize four rows | |||
| const float* S0 = src.row(sy - 1); | |||
| const float* S1 = src.row(sy); | |||
| const float* S2 = src.row(sy + 1); | |||
| const float* S3 = src.row(sy + 2); | |||
| const float* alphap = alpha; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 8; | |||
| const float* S0p = S0 + sx; | |||
| const float* S1p = S1 + sx; | |||
| const float* S2p = S2 + sx; | |||
| const float* S3p = S3 + sx; | |||
| __m256 _a0 = _mm256_set1_ps(alphap[0]); | |||
| __m256 _a1 = _mm256_set1_ps(alphap[1]); | |||
| __m256 _a2 = _mm256_set1_ps(alphap[2]); | |||
| __m256 _a3 = _mm256_set1_ps(alphap[3]); | |||
| __m256 _S00 = _mm256_load_ps(S0p - 8); | |||
| __m256 _S01 = _mm256_load_ps(S0p + 0); | |||
| __m256 _S02 = _mm256_load_ps(S0p + 8); | |||
| __m256 _S03 = _mm256_load_ps(S0p + 16); | |||
| __m256 _S10 = _mm256_load_ps(S1p - 8); | |||
| __m256 _S11 = _mm256_load_ps(S1p + 0); | |||
| __m256 _S12 = _mm256_load_ps(S1p + 8); | |||
| __m256 _S13 = _mm256_load_ps(S1p + 16); | |||
| __m256 _S20 = _mm256_load_ps(S2p - 8); | |||
| __m256 _S21 = _mm256_load_ps(S2p + 0); | |||
| __m256 _S22 = _mm256_load_ps(S2p + 8); | |||
| __m256 _S23 = _mm256_load_ps(S2p + 16); | |||
| __m256 _S30 = _mm256_load_ps(S3p - 8); | |||
| __m256 _S31 = _mm256_load_ps(S3p + 0); | |||
| __m256 _S32 = _mm256_load_ps(S3p + 8); | |||
| __m256 _S33 = _mm256_load_ps(S3p + 16); | |||
| __m256 _rows0 = _mm256_mul_ps(_S00, _a0); | |||
| __m256 _rows1 = _mm256_mul_ps(_S10, _a0); | |||
| __m256 _rows2 = _mm256_mul_ps(_S20, _a0); | |||
| __m256 _rows3 = _mm256_mul_ps(_S30, _a0); | |||
| _rows0 = _mm256_comp_fmadd_ps(_S01, _a1, _rows0); | |||
| _rows1 = _mm256_comp_fmadd_ps(_S11, _a1, _rows1); | |||
| _rows2 = _mm256_comp_fmadd_ps(_S21, _a1, _rows2); | |||
| _rows3 = _mm256_comp_fmadd_ps(_S31, _a1, _rows3); | |||
| _rows0 = _mm256_comp_fmadd_ps(_S02, _a2, _rows0); | |||
| _rows1 = _mm256_comp_fmadd_ps(_S12, _a2, _rows1); | |||
| _rows2 = _mm256_comp_fmadd_ps(_S22, _a2, _rows2); | |||
| _rows3 = _mm256_comp_fmadd_ps(_S32, _a2, _rows3); | |||
| _rows0 = _mm256_comp_fmadd_ps(_S03, _a3, _rows0); | |||
| _rows1 = _mm256_comp_fmadd_ps(_S13, _a3, _rows1); | |||
| _rows2 = _mm256_comp_fmadd_ps(_S23, _a3, _rows2); | |||
| _rows3 = _mm256_comp_fmadd_ps(_S33, _a3, _rows3); | |||
| _mm256_store_ps(rows0p + dx * 8, _rows0); | |||
| _mm256_store_ps(rows1p + dx * 8, _rows1); | |||
| _mm256_store_ps(rows2p + dx * 8, _rows2); | |||
| _mm256_store_ps(rows3p + dx * 8, _rows3); | |||
| alphap += 4; | |||
| } | |||
| } | |||
| prev_sy1 = sy; | |||
| // vresize | |||
| __m256 _b0 = _mm256_set1_ps(beta[0]); | |||
| __m256 _b1 = _mm256_set1_ps(beta[1]); | |||
| __m256 _b2 = _mm256_set1_ps(beta[2]); | |||
| __m256 _b3 = _mm256_set1_ps(beta[3]); | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| float* Dp = dst.row(dy); | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| __m256 _rows0 = _mm256_load_ps(rows0p); | |||
| __m256 _rows1 = _mm256_load_ps(rows1p); | |||
| __m256 _rows2 = _mm256_load_ps(rows2p); | |||
| __m256 _rows3 = _mm256_load_ps(rows3p); | |||
| __m256 _D = _mm256_mul_ps(_rows0, _b0); | |||
| _D = _mm256_comp_fmadd_ps(_rows1, _b1, _D); | |||
| _D = _mm256_comp_fmadd_ps(_rows2, _b2, _D); | |||
| _D = _mm256_comp_fmadd_ps(_rows3, _b3, _D); | |||
| _mm256_store_ps(Dp, _D); | |||
| Dp += 8; | |||
| rows0p += 8; | |||
| rows1p += 8; | |||
| rows2p += 8; | |||
| rows3p += 8; | |||
| } | |||
| beta += 4; | |||
| } | |||
| } | |||
| @@ -0,0 +1,171 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
// Precomputes, for every output sample along one axis, the left source index
// and the two linear blend weights.
//
//   w             source length along the axis
//   outw          output length along the axis
//   xofs          [out] outw entries; output dx blends source samples
//                 xofs[dx] and xofs[dx] + 1
//   alpha         [out] outw * 2 weights: alpha[dx * 2] for xofs[dx],
//                 alpha[dx * 2 + 1] for xofs[dx] + 1
//   align_corner  non-zero: first/last output samples map exactly onto the
//                 first/last source samples; zero: pixel-centre mapping
//
// NOTE(review): the upper clamp yields xofs == w - 2, which is -1 for w == 1;
// presumably callers never pass w < 2 - confirm.
static void linear_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner)
{
    double scale = (double)w / outw;
    if (align_corner)
    {
        // A single output sample maps onto source index 0. Dividing by
        // (outw - 1) == 0 would turn fx into NaN, and converting NaN to int
        // is undefined behaviour.
        scale = outw > 1 ? (double)(w - 1) / (outw - 1) : 0.0;
    }

    for (int dx = 0; dx < outw; dx++)
    {
        float fx = align_corner ? (float)(dx * scale) : (float)((dx + 0.5) * scale - 0.5);

        int sx = static_cast<int>(floor(fx));
        fx -= sx;

        if (sx < 0)
        {
            // left of the first sample: use sample 0 only
            sx = 0;
            fx = 0.f;
        }
        if (sx >= w - 1)
        {
            // at or beyond the last sample: use sample w - 1 only
            sx = w - 2;
            fx = 1.f;
        }

        xofs[dx] = sx;

        alpha[dx * 2] = 1.f - fx;
        alpha[dx * 2 + 1] = fx;
    }
}
| static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) | |||
| { | |||
| int w = dst.w; | |||
| int h = dst.h; | |||
| // loop body | |||
| Mat rowsbuf0(w); | |||
| Mat rowsbuf1(w); | |||
| float* rows0 = rowsbuf0; | |||
| float* rows1 = rowsbuf1; | |||
| int prev_sy1 = -2; | |||
| for (int dy = 0; dy < h; dy++) | |||
| { | |||
| int sy = yofs[dy]; | |||
| if (sy == prev_sy1) | |||
| { | |||
| // reuse all rows | |||
| } | |||
| else if (sy == prev_sy1 + 1) | |||
| { | |||
| // hresize one row | |||
| float* rows0_old = rows0; | |||
| rows0 = rows1; | |||
| rows1 = rows0_old; | |||
| const float* S1 = src.row(sy + 1); | |||
| const float* alphap = alpha; | |||
| float* rows1p = rows1; | |||
| int dx = 0; | |||
| for (; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const float* S1p = S1 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; | |||
| alphap += 2; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // hresize two rows | |||
| const float* S0 = src.row(sy); | |||
| const float* S1 = src.row(sy + 1); | |||
| const float* alphap = alpha; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| int dx = 0; | |||
| for (; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const float* S0p = S0 + sx; | |||
| const float* S1p = S1 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| rows0p[dx] = S0p[0] * a0 + S0p[1] * a1; | |||
| rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; | |||
| alphap += 2; | |||
| } | |||
| } | |||
| prev_sy1 = sy; | |||
| // vresize | |||
| float b0 = beta[0]; | |||
| float b1 = beta[1]; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* Dp = dst.row(dy); | |||
| int dx = 0; | |||
| #if __SSE2__ | |||
| #if __AVX__ | |||
| __m256 _b0_256 = _mm256_set1_ps(b0); | |||
| __m256 _b1_256 = _mm256_set1_ps(b1); | |||
| for (; dx + 7 < w; dx += 8) | |||
| { | |||
| __m256 _rows0 = _mm256_loadu_ps(rows0p); | |||
| __m256 _rows1 = _mm256_loadu_ps(rows1p); | |||
| __m256 _D = _mm256_mul_ps(_rows0, _b0_256); | |||
| _D = _mm256_comp_fmadd_ps(_rows1, _b1_256, _D); | |||
| _mm256_storeu_ps(Dp, _D); | |||
| Dp += 8; | |||
| rows0p += 8; | |||
| rows1p += 8; | |||
| } | |||
| #endif // __AVX__ | |||
| __m128 _b0_128 = _mm_set1_ps(b0); | |||
| __m128 _b1_128 = _mm_set1_ps(b1); | |||
| for (; dx + 3 < w; dx += 4) | |||
| { | |||
| __m128 _rows0 = _mm_loadu_ps(rows0p); | |||
| __m128 _rows1 = _mm_loadu_ps(rows1p); | |||
| __m128 _D = _mm_mul_ps(_rows0, _b0_128); | |||
| _D = _mm_comp_fmadd_ps(_rows1, _b1_128, _D); | |||
| _mm_storeu_ps(Dp, _D); | |||
| Dp += 4; | |||
| rows0p += 4; | |||
| rows1p += 4; | |||
| } | |||
| #endif // __SSE2__ | |||
| for (; dx < w; dx++) | |||
| { | |||
| *Dp++ = *rows0p++ * b0 + *rows1p++ * b1; | |||
| } | |||
| beta += 2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,123 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void resize_bilinear_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) | |||
| { | |||
| int w = dst.w; | |||
| int h = dst.h; | |||
| // loop body | |||
| Mat rowsbuf0(w, (size_t)4 * 4u, 4); | |||
| Mat rowsbuf1(w, (size_t)4 * 4u, 4); | |||
| float* rows0 = rowsbuf0; | |||
| float* rows1 = rowsbuf1; | |||
| int prev_sy1 = -2; | |||
| for (int dy = 0; dy < h; dy++) | |||
| { | |||
| int sy = yofs[dy]; | |||
| if (sy == prev_sy1) | |||
| { | |||
| // reuse all rows | |||
| } | |||
| else if (sy == prev_sy1 + 1) | |||
| { | |||
| // hresize one row | |||
| float* rows0_old = rows0; | |||
| rows0 = rows1; | |||
| rows1 = rows0_old; | |||
| const float* S1 = src.row(sy + 1); | |||
| const float* alphap = alpha; | |||
| float* rows1p = rows1; | |||
| int dx = 0; | |||
| for (; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const float* S1p = S1 + sx; | |||
| __m128 _a0 = _mm_set1_ps(alphap[0]); | |||
| __m128 _a1 = _mm_set1_ps(alphap[1]); | |||
| __m128 _S10 = _mm_load_ps(S1p); | |||
| __m128 _S11 = _mm_load_ps(S1p + 4); | |||
| __m128 _rows1 = _mm_mul_ps(_S10, _a0); | |||
| _rows1 = _mm_comp_fmadd_ps(_S11, _a1, _rows1); | |||
| _mm_store_ps(rows1p + dx * 4, _rows1); | |||
| alphap += 2; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // hresize two rows | |||
| const float* S0 = src.row(sy); | |||
| const float* S1 = src.row(sy + 1); | |||
| const float* alphap = alpha; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| int dx = 0; | |||
| for (; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const float* S0p = S0 + sx; | |||
| const float* S1p = S1 + sx; | |||
| __m128 _a0 = _mm_set1_ps(alphap[0]); | |||
| __m128 _a1 = _mm_set1_ps(alphap[1]); | |||
| __m128 _S00 = _mm_load_ps(S0p); | |||
| __m128 _S01 = _mm_load_ps(S0p + 4); | |||
| __m128 _S10 = _mm_load_ps(S1p); | |||
| __m128 _S11 = _mm_load_ps(S1p + 4); | |||
| __m128 _rows0 = _mm_mul_ps(_S00, _a0); | |||
| __m128 _rows1 = _mm_mul_ps(_S10, _a0); | |||
| _rows0 = _mm_comp_fmadd_ps(_S01, _a1, _rows0); | |||
| _rows1 = _mm_comp_fmadd_ps(_S11, _a1, _rows1); | |||
| _mm_store_ps(rows0p + dx * 4, _rows0); | |||
| _mm_store_ps(rows1p + dx * 4, _rows1); | |||
| alphap += 2; | |||
| } | |||
| } | |||
| prev_sy1 = sy; | |||
| // vresize | |||
| __m128 _b0 = _mm_set1_ps(beta[0]); | |||
| __m128 _b1 = _mm_set1_ps(beta[1]); | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* Dp = dst.row(dy); | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| __m128 _rows0 = _mm_load_ps(rows0p); | |||
| __m128 _rows1 = _mm_load_ps(rows1p); | |||
| __m128 _D = _mm_mul_ps(_rows0, _b0); | |||
| _D = _mm_comp_fmadd_ps(_rows1, _b1, _D); | |||
| _mm_store_ps(Dp, _D); | |||
| Dp += 4; | |||
| rows0p += 4; | |||
| rows1p += 4; | |||
| } | |||
| beta += 2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,123 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void resize_bilinear_image_pack8(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) | |||
| { | |||
| int w = dst.w; | |||
| int h = dst.h; | |||
| // loop body | |||
| Mat rowsbuf0(w, (size_t)8 * 4u, 8); | |||
| Mat rowsbuf1(w, (size_t)8 * 4u, 8); | |||
| float* rows0 = rowsbuf0; | |||
| float* rows1 = rowsbuf1; | |||
| int prev_sy1 = -2; | |||
| for (int dy = 0; dy < h; dy++) | |||
| { | |||
| int sy = yofs[dy]; | |||
| if (sy == prev_sy1) | |||
| { | |||
| // reuse all rows | |||
| } | |||
| else if (sy == prev_sy1 + 1) | |||
| { | |||
| // hresize one row | |||
| float* rows0_old = rows0; | |||
| rows0 = rows1; | |||
| rows1 = rows0_old; | |||
| const float* S1 = src.row(sy + 1); | |||
| const float* alphap = alpha; | |||
| float* rows1p = rows1; | |||
| int dx = 0; | |||
| for (; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 8; | |||
| const float* S1p = S1 + sx; | |||
| __m256 _a0 = _mm256_set1_ps(alphap[0]); | |||
| __m256 _a1 = _mm256_set1_ps(alphap[1]); | |||
| __m256 _S10 = _mm256_load_ps(S1p); | |||
| __m256 _S11 = _mm256_load_ps(S1p + 8); | |||
| __m256 _rows1 = _mm256_mul_ps(_S10, _a0); | |||
| _rows1 = _mm256_comp_fmadd_ps(_S11, _a1, _rows1); | |||
| _mm256_store_ps(rows1p + dx * 8, _rows1); | |||
| alphap += 2; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // hresize two rows | |||
| const float* S0 = src.row(sy); | |||
| const float* S1 = src.row(sy + 1); | |||
| const float* alphap = alpha; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| int dx = 0; | |||
| for (; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 8; | |||
| const float* S0p = S0 + sx; | |||
| const float* S1p = S1 + sx; | |||
| __m256 _a0 = _mm256_set1_ps(alphap[0]); | |||
| __m256 _a1 = _mm256_set1_ps(alphap[1]); | |||
| __m256 _S00 = _mm256_load_ps(S0p); | |||
| __m256 _S01 = _mm256_load_ps(S0p + 8); | |||
| __m256 _S10 = _mm256_load_ps(S1p); | |||
| __m256 _S11 = _mm256_load_ps(S1p + 8); | |||
| __m256 _rows0 = _mm256_mul_ps(_S00, _a0); | |||
| __m256 _rows1 = _mm256_mul_ps(_S10, _a0); | |||
| _rows0 = _mm256_comp_fmadd_ps(_S01, _a1, _rows0); | |||
| _rows1 = _mm256_comp_fmadd_ps(_S11, _a1, _rows1); | |||
| _mm256_store_ps(rows0p + dx * 8, _rows0); | |||
| _mm256_store_ps(rows1p + dx * 8, _rows1); | |||
| alphap += 2; | |||
| } | |||
| } | |||
| prev_sy1 = sy; | |||
| // vresize | |||
| __m256 _b0 = _mm256_set1_ps(beta[0]); | |||
| __m256 _b1 = _mm256_set1_ps(beta[1]); | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* Dp = dst.row(dy); | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| __m256 _rows0 = _mm256_load_ps(rows0p); | |||
| __m256 _rows1 = _mm256_load_ps(rows1p); | |||
| __m256 _D = _mm256_mul_ps(_rows0, _b0); | |||
| _D = _mm256_comp_fmadd_ps(_rows1, _b1, _D); | |||
| _mm256_store_ps(Dp, _D); | |||
| Dp += 8; | |||
| rows0p += 8; | |||
| rows1p += 8; | |||
| } | |||
| beta += 2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,690 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "interp_x86.h" | |||
| #include <math.h> | |||
| #if __SSE2__ | |||
| #include <emmintrin.h> | |||
| #if __AVX__ | |||
| #include <immintrin.h> | |||
| #endif // __AVX__ | |||
| #endif // __SSE2__ | |||
| #include "x86_usability.h" | |||
| namespace ncnn { | |||
| #include "interp_bicubic.h" | |||
| #include "interp_bilinear.h" | |||
| #if __SSE2__ | |||
| #include "interp_bicubic_pack4.h" | |||
| #include "interp_bilinear_pack4.h" | |||
| #if __AVX__ | |||
| #include "interp_bicubic_pack8.h" | |||
| #include "interp_bilinear_pack8.h" | |||
| #endif | |||
| #endif | |||
| Interp_x86::Interp_x86() | |||
| { | |||
| #if __SSE2__ | |||
| support_packing = true; | |||
| #endif // __SSE2__ | |||
| } | |||
// Resize bottom_blobs[0] to the spatial size of bottom_blobs[1] (the reference blob).
//
// bottom_blobs[0] is the input; only w and h of bottom_blobs[1] are read and they
// give the output width/height. The result is written to top_blobs[0].
//
// Behaviour by input rank:
//   dims == 1  every one of the w input elements becomes one output channel of
//              size outw x outh filled with that element's value
//   dims == 2  only the width is resized (the h rows are kept); when outw == w
//              the input blob is assigned to the output unchanged
//   otherwise  every channel is resized from w x h to outw x outh; when the size
//              already matches, the input blob is assigned to the output unchanged
//
// resize_type selects the kernel (1 nearest, 2 bilinear, 3 bicubic); for any other
// value the freshly created output is returned without being written.
// resize_type, align_corner, width_scale and height_scale are not declared in this
// function - presumably members inherited from Interp.
//
// Each rank has an elempack == 8 (AVX), an elempack == 4 (SSE2) and a scalar
// variant that differ only in how many floats are moved per pixel.
//
// Returns 0 on success and -100 when the output blob could not be created.
| int Interp_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| const Mat& reference_blob = bottom_blobs[1]; | |||
| Mat& top_blob = top_blobs[0]; | |||
| int h = bottom_blob.h; | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| int dims = bottom_blob.dims; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
// target size comes from the reference blob only
| int outw = reference_blob.w; | |||
| int outh = reference_blob.h; | |||
// ---- rank 1: broadcast every input element over an outw x outh plane ----
| if (dims == 1) | |||
| { | |||
// one output channel per input element
| top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| #if __SSE2__ | |||
| #if __AVX__ | |||
| if (elempack == 8) | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < w; q++) | |||
| { | |||
| Mat top_blob_c = top_blob.channel(q); | |||
| __m256 _v = _mm256_load_ps((const float*)bottom_blob + q * 8); | |||
| top_blob_c.fill(_v); | |||
| } | |||
| return 0; | |||
| } | |||
| #endif // __AVX__ | |||
| if (elempack == 4) | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < w; q++) | |||
| { | |||
| Mat top_blob_c = top_blob.channel(q); | |||
| __m128 _v = _mm_load_ps((const float*)bottom_blob + q * 4); | |||
| top_blob_c.fill(_v); | |||
| } | |||
| return 0; | |||
| } | |||
| #endif // __SSE2__ | |||
// scalar fallback (elempack 1, or a build without SSE2)
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < w; q++) | |||
| { | |||
| Mat top_blob_c = top_blob.channel(q); | |||
| const float v = bottom_blob[q]; | |||
| top_blob_c.fill(v); | |||
| } | |||
| return 0; | |||
| } | |||
// ---- rank 2: resize along the width only, row by row ----
| if (dims == 2) | |||
| { | |||
// nothing to do: hand the input blob through
| if (outw == w) | |||
| { | |||
| top_blob = bottom_blob; | |||
| return 0; | |||
| } | |||
| top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| #if __SSE2__ | |||
| #if __AVX__ | |||
| if (elempack == 8) | |||
| { | |||
| if (resize_type == 1) // nearest | |||
| { | |||
// source step per output pixel; falls back to the configured scale if outw is 0
| const float ws = outw ? w / (float)outw : 1.f / width_scale; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| const float* ptr = bottom_blob.row(y); | |||
| float* outptr = top_blob.row(y); | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int in_x = std::min((int)(x * ws), (w - 1)); | |||
| __m256 _p = _mm256_load_ps(ptr + in_x * 8); | |||
| _mm256_store_ps(outptr, _p); | |||
| outptr += 8; | |||
| } | |||
| } | |||
| } | |||
| if (resize_type == 2) // bilinear | |||
| { | |||
// one allocation: outw int offsets followed by outw * 2 float weights
| int* buf = new int[outw + outw * 2]; | |||
| int* xofs = buf; | |||
| float* alpha = (float*)(buf + outw); | |||
| linear_coeffs(w, outw, xofs, alpha, align_corner); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| const float* ptr = bottom_blob.row(y); | |||
| float* outptr = top_blob.row(y); | |||
| const float* alphap = alpha; | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int sx = xofs[x] * 8; | |||
| const float* Sp = ptr + sx; | |||
| __m256 _a0 = _mm256_set1_ps(alphap[0]); | |||
| __m256 _a1 = _mm256_set1_ps(alphap[1]); | |||
| __m256 _S0 = _mm256_load_ps(Sp); | |||
| __m256 _S1 = _mm256_load_ps(Sp + 8); | |||
| __m256 _p = _mm256_mul_ps(_S0, _a0); | |||
| _p = _mm256_comp_fmadd_ps(_S1, _a1, _p); | |||
| _mm256_store_ps(outptr, _p); | |||
| alphap += 2; | |||
| outptr += 8; | |||
| } | |||
| } | |||
| delete[] buf; | |||
| } | |||
| if (resize_type == 3) // bicubic | |||
| { | |||
// one allocation: outw int offsets followed by outw * 4 float weights
| int* buf = new int[outw + outw * 4]; | |||
| int* xofs = buf; | |||
| float* alpha = (float*)(buf + outw); | |||
| cubic_coeffs(w, outw, xofs, alpha, align_corner); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| const float* ptr = bottom_blob.row(y); | |||
| float* outptr = top_blob.row(y); | |||
| const float* alphap = alpha; | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int sx = xofs[x] * 8; | |||
| const float* Sp = ptr + sx; | |||
| __m256 _a0 = _mm256_set1_ps(alphap[0]); | |||
| __m256 _a1 = _mm256_set1_ps(alphap[1]); | |||
| __m256 _a2 = _mm256_set1_ps(alphap[2]); | |||
| __m256 _a3 = _mm256_set1_ps(alphap[3]); | |||
// four taps: one packed pixel before sx up to two after it
| __m256 _S0 = _mm256_load_ps(Sp - 8); | |||
| __m256 _S1 = _mm256_load_ps(Sp + 0); | |||
| __m256 _S2 = _mm256_load_ps(Sp + 8); | |||
| __m256 _S3 = _mm256_load_ps(Sp + 16); | |||
| __m256 _p = _mm256_mul_ps(_S0, _a0); | |||
| _p = _mm256_comp_fmadd_ps(_S1, _a1, _p); | |||
| _p = _mm256_comp_fmadd_ps(_S2, _a2, _p); | |||
| _p = _mm256_comp_fmadd_ps(_S3, _a3, _p); | |||
| _mm256_store_ps(outptr, _p); | |||
| alphap += 4; | |||
| outptr += 8; | |||
| } | |||
| } | |||
| delete[] buf; | |||
| } | |||
| return 0; | |||
| } | |||
| #endif // __AVX__ | |||
| if (elempack == 4) | |||
| { | |||
| if (resize_type == 1) // nearest | |||
| { | |||
| const float ws = outw ? w / (float)outw : 1.f / width_scale; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| const float* ptr = bottom_blob.row(y); | |||
| float* outptr = top_blob.row(y); | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int in_x = std::min((int)(x * ws), (w - 1)); | |||
| __m128 _p = _mm_load_ps(ptr + in_x * 4); | |||
| _mm_store_ps(outptr, _p); | |||
| outptr += 4; | |||
| } | |||
| } | |||
| } | |||
| if (resize_type == 2) // bilinear | |||
| { | |||
| int* buf = new int[outw + outw * 2]; | |||
| int* xofs = buf; | |||
| float* alpha = (float*)(buf + outw); | |||
| linear_coeffs(w, outw, xofs, alpha, align_corner); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| const float* ptr = bottom_blob.row(y); | |||
| float* outptr = top_blob.row(y); | |||
| const float* alphap = alpha; | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int sx = xofs[x] * 4; | |||
| const float* Sp = ptr + sx; | |||
| __m128 _a0 = _mm_set1_ps(alphap[0]); | |||
| __m128 _a1 = _mm_set1_ps(alphap[1]); | |||
| __m128 _S0 = _mm_load_ps(Sp); | |||
| __m128 _S1 = _mm_load_ps(Sp + 4); | |||
| __m128 _p = _mm_mul_ps(_S0, _a0); | |||
| _p = _mm_comp_fmadd_ps(_S1, _a1, _p); | |||
| _mm_store_ps(outptr, _p); | |||
| alphap += 2; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| delete[] buf; | |||
| } | |||
| if (resize_type == 3) // bicubic | |||
| { | |||
| int* buf = new int[outw + outw * 4]; | |||
| int* xofs = buf; | |||
| float* alpha = (float*)(buf + outw); | |||
| cubic_coeffs(w, outw, xofs, alpha, align_corner); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| const float* ptr = bottom_blob.row(y); | |||
| float* outptr = top_blob.row(y); | |||
| const float* alphap = alpha; | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int sx = xofs[x] * 4; | |||
| const float* Sp = ptr + sx; | |||
| __m128 _a0 = _mm_set1_ps(alphap[0]); | |||
| __m128 _a1 = _mm_set1_ps(alphap[1]); | |||
| __m128 _a2 = _mm_set1_ps(alphap[2]); | |||
| __m128 _a3 = _mm_set1_ps(alphap[3]); | |||
| __m128 _S0 = _mm_load_ps(Sp - 4); | |||
| __m128 _S1 = _mm_load_ps(Sp + 0); | |||
| __m128 _S2 = _mm_load_ps(Sp + 4); | |||
| __m128 _S3 = _mm_load_ps(Sp + 8); | |||
| __m128 _p = _mm_mul_ps(_S0, _a0); | |||
| _p = _mm_comp_fmadd_ps(_S1, _a1, _p); | |||
| _p = _mm_comp_fmadd_ps(_S2, _a2, _p); | |||
| _p = _mm_comp_fmadd_ps(_S3, _a3, _p); | |||
| _mm_store_ps(outptr, _p); | |||
| alphap += 4; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| delete[] buf; | |||
| } | |||
| return 0; | |||
| } | |||
| #endif // __SSE2__ | |||
// scalar fallback (elempack 1, or a build without SSE2)
| if (resize_type == 1) // nearest | |||
| { | |||
| const float ws = outw ? w / (float)outw : 1.f / width_scale; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| const float* ptr = bottom_blob.row(y); | |||
| float* outptr = top_blob.row(y); | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int in_x = std::min((int)(x * ws), (w - 1)); | |||
| *outptr++ = ptr[in_x]; | |||
| } | |||
| } | |||
| } | |||
| if (resize_type == 2) // bilinear | |||
| { | |||
| int* buf = new int[outw + outw * 2]; | |||
| int* xofs = buf; | |||
| float* alpha = (float*)(buf + outw); | |||
| linear_coeffs(w, outw, xofs, alpha, align_corner); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| const float* ptr = bottom_blob.row(y); | |||
| float* outptr = top_blob.row(y); | |||
| const float* alphap = alpha; | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int sx = xofs[x]; | |||
| const float* Sp = ptr + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| *outptr++ = Sp[0] * a0 + Sp[1] * a1; | |||
| alphap += 2; | |||
| } | |||
| } | |||
| delete[] buf; | |||
| } | |||
| if (resize_type == 3) // bicubic | |||
| { | |||
| int* buf = new int[outw + outw * 4]; | |||
| int* xofs = buf; | |||
| float* alpha = (float*)(buf + outw); | |||
| cubic_coeffs(w, outw, xofs, alpha, align_corner); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| const float* ptr = bottom_blob.row(y); | |||
| float* outptr = top_blob.row(y); | |||
| const float* alphap = alpha; | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int sx = xofs[x]; | |||
| const float* Sp = ptr + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| float a2 = alphap[2]; | |||
| float a3 = alphap[3]; | |||
| *outptr++ = Sp[-1] * a0 + Sp[0] * a1 + Sp[1] * a2 + Sp[2] * a3; | |||
| alphap += 4; | |||
| } | |||
| } | |||
| delete[] buf; | |||
| } | |||
| return 0; | |||
| } | |||
// ---- remaining ranks: resize every channel in both width and height ----
// nothing to do: hand the input blob through
| if (outw == w && outh == h) | |||
| { | |||
| top_blob = bottom_blob; | |||
| return 0; | |||
| } | |||
| top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| #if __SSE2__ | |||
| #if __AVX__ | |||
| if (elempack == 8) | |||
| { | |||
| if (resize_type == 1) // nearest | |||
| { | |||
| const float hs = outh ? h / (float)outh : 1.f / height_scale; | |||
| const float ws = outw ? w / (float)outw : 1.f / width_scale; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| for (int y = 0; y < outh; y++) | |||
| { | |||
| int in_y = std::min((int)(y * hs), (h - 1)); | |||
| const float* ptr = src.row(in_y); | |||
| float* outptr = dst.row(y); | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int in_x = std::min((int)(x * ws), (w - 1)); | |||
| __m256 _p = _mm256_load_ps(ptr + in_x * 8); | |||
| _mm256_store_ps(outptr, _p); | |||
| outptr += 8; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| if (resize_type == 2) // bilinear | |||
| { | |||
// one allocation shared by all channels: x/y offsets followed by x/y weights
| int* buf = new int[outw + outh + outw * 2 + outh * 2]; | |||
| int* xofs = buf; //new int[outw]; | |||
| int* yofs = buf + outw; //new int[outh]; | |||
| float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; | |||
| float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; | |||
| linear_coeffs(w, outw, xofs, alpha, align_corner); | |||
| linear_coeffs(h, outh, yofs, beta, align_corner); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| resize_bilinear_image_pack8(src, dst, alpha, xofs, beta, yofs); | |||
| } | |||
| delete[] buf; | |||
| } | |||
| if (resize_type == 3) // bicubic | |||
| { | |||
| int* buf = new int[outw + outh + outw * 4 + outh * 4]; | |||
| int* xofs = buf; //new int[outw]; | |||
| int* yofs = buf + outw; //new int[outh]; | |||
| float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; | |||
| float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; | |||
| cubic_coeffs(w, outw, xofs, alpha, align_corner); | |||
| cubic_coeffs(h, outh, yofs, beta, align_corner); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| resize_bicubic_image_pack8(src, dst, alpha, xofs, beta, yofs); | |||
| } | |||
| delete[] buf; | |||
| } | |||
| return 0; | |||
| } | |||
| #endif // __AVX__ | |||
| if (elempack == 4) | |||
| { | |||
| if (resize_type == 1) // nearest | |||
| { | |||
| const float hs = outh ? h / (float)outh : 1.f / height_scale; | |||
| const float ws = outw ? w / (float)outw : 1.f / width_scale; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| for (int y = 0; y < outh; y++) | |||
| { | |||
| int in_y = std::min((int)(y * hs), (h - 1)); | |||
| const float* ptr = src.row(in_y); | |||
| float* outptr = dst.row(y); | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int in_x = std::min((int)(x * ws), (w - 1)); | |||
| __m128 _p = _mm_load_ps(ptr + in_x * 4); | |||
| _mm_store_ps(outptr, _p); | |||
| outptr += 4; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| if (resize_type == 2) // bilinear | |||
| { | |||
| int* buf = new int[outw + outh + outw * 2 + outh * 2]; | |||
| int* xofs = buf; //new int[outw]; | |||
| int* yofs = buf + outw; //new int[outh]; | |||
| float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; | |||
| float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; | |||
| linear_coeffs(w, outw, xofs, alpha, align_corner); | |||
| linear_coeffs(h, outh, yofs, beta, align_corner); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| resize_bilinear_image_pack4(src, dst, alpha, xofs, beta, yofs); | |||
| } | |||
| delete[] buf; | |||
| } | |||
| if (resize_type == 3) // bicubic | |||
| { | |||
| int* buf = new int[outw + outh + outw * 4 + outh * 4]; | |||
| int* xofs = buf; //new int[outw]; | |||
| int* yofs = buf + outw; //new int[outh]; | |||
| float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; | |||
| float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; | |||
| cubic_coeffs(w, outw, xofs, alpha, align_corner); | |||
| cubic_coeffs(h, outh, yofs, beta, align_corner); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| resize_bicubic_image_pack4(src, dst, alpha, xofs, beta, yofs); | |||
| } | |||
| delete[] buf; | |||
| } | |||
| return 0; | |||
| } | |||
| #endif // __SSE2__ | |||
// scalar fallback (elempack 1, or a build without SSE2)
| if (resize_type == 1) // nearest | |||
| { | |||
| const float hs = outh ? h / (float)outh : 1.f / height_scale; | |||
| const float ws = outw ? w / (float)outw : 1.f / width_scale; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| for (int y = 0; y < outh; y++) | |||
| { | |||
| int in_y = std::min((int)(y * hs), (h - 1)); | |||
| const float* ptr = src.row(in_y); | |||
| float* outptr = dst.row(y); | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int in_x = std::min((int)(x * ws), (w - 1)); | |||
| *outptr++ = ptr[in_x]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| if (resize_type == 2) // bilinear | |||
| { | |||
| int* buf = new int[outw + outh + outw * 2 + outh * 2]; | |||
| int* xofs = buf; //new int[outw]; | |||
| int* yofs = buf + outw; //new int[outh]; | |||
| float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; | |||
| float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; | |||
| linear_coeffs(w, outw, xofs, alpha, align_corner); | |||
| linear_coeffs(h, outh, yofs, beta, align_corner); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| resize_bilinear_image(src, dst, alpha, xofs, beta, yofs); | |||
| } | |||
| delete[] buf; | |||
| } | |||
| if (resize_type == 3) // bicubic | |||
| { | |||
| int* buf = new int[outw + outh + outw * 4 + outh * 4]; | |||
| int* xofs = buf; //new int[outw]; | |||
| int* yofs = buf + outw; //new int[outh]; | |||
| float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; | |||
| float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; | |||
| cubic_coeffs(w, outw, xofs, alpha, align_corner); | |||
| cubic_coeffs(h, outh, yofs, beta, align_corner); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| resize_bicubic_image(src, dst, alpha, xofs, beta, yofs); | |||
| } | |||
| delete[] buf; | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,32 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_INTERP_X86_H | |||
| #define LAYER_INTERP_X86_H | |||
| #include "interp.h" | |||
| namespace ncnn { | |||
// x86 variant of the Interp layer; derives virtually from Interp and overrides
// the two-input forward() with SSE2/AVX-aware code paths.
| class Interp_x86 : virtual public Interp | |||
| { | |||
| public: | |||
// Sets support_packing when the build has SSE2 enabled.
| Interp_x86(); | |||
// Resizes bottom_blobs[0] to the width/height of bottom_blobs[1] and stores the
// result in top_blobs[0]. Returns 0 on success, -100 if the output blob cannot
// be created.
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_INTERP_X86_H | |||