// g++ -std=c++14 -o main -O3 -g main.cpp
// g++ -std=c++14 -o main -Ofast -g main.cpp
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>
#include <random>
#include <type_traits>

/// Runs statement `X` once and prints the wall-clock time it took, in
/// nanoseconds, to std::cout.
#define BENCHMARK(X) do { \
    auto start = std::chrono::high_resolution_clock::now(); \
    X; \
    auto end = std::chrono::high_resolution_clock::now(); \
    auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start); \
    std::cout << "Elpased Time: " << duration.count() << " ns" << std::endl; \
} while (0)

/// Heap array of trivially-constructible `T` whose first element lies on an
/// `alignment`-byte boundary.
///
/// The storage is over-allocated by `alignment` bytes and the element pointer
/// is advanced with std::align, so the owning pointer (the one that gets
/// released) is always the pointer that was originally allocated.
/// `alignment` must be a power of two. The bytes are zero-initialised.
template <typename T>
class AlignedArray {
    static_assert(std::is_trivial<T>::value,
                  "AlignedArray only hosts trivial element types");

public:
    AlignedArray(std::size_t count, std::size_t alignment)
        : storage_(std::make_unique<unsigned char[]>(count * sizeof(T) + alignment)) {
        void *cursor = storage_.get();
        std::size_t space = count * sizeof(T) + alignment;
        // Cannot fail: `space` exceeds the payload by a full alignment step.
        data_ = static_cast<T *>(std::align(alignment, count * sizeof(T), cursor, space));
    }

    /// Aligned pointer to the first element (non-owning).
    T *get() const noexcept { return data_; }

private:
    std::unique_ptr<unsigned char[]> storage_;  ///< owns the raw allocation
    T *data_ = nullptr;                         ///< aligned view into storage_
};

/// Experimental inline-assembly variant of the remap kernel.
///
/// When the guarded block is compiled out the function does nothing and
/// returns 0.
///
/// NOTE(review): the guard tests `__ARM__NEON__`; the macro defined by the Arm
///   C Language Extensions is `__ARM_NEON`, so this block is presumably never
///   compiled. Do not switch the guard before the points below are resolved:
///   - `SRC_WIDTH` is not defined anywhere in this file.
///   - `yuv_undistort` is received by value as a single byte, yet it is bound
///     to the `yuv444_dst` operand and used as the destination address.
///   - `ldr x4` / `str x4` transfer 8 bytes while the store post-increments
///     the destination by only 3.
///   - the vector lane operands (`v2.4s[0]`, `v1.4s[0]`, ...) should be checked
///     against the assembler's accepted by-element syntax.
///   - `height_loop` / `width_loop` are global labels, so the asm statement
///     cannot be emitted twice in one translation unit.
///
/// @param yuv_raw        source image bytes, indexed by the computed offsets
/// @param yuv_undistort  destination operand (see note above)
/// @param fastmap        interleaved pairs of 32-bit map values read by `ld2`
/// @param width          inner loop count; decremented by 4 per iteration
/// @param height         outer loop count
/// @return always 0
int fasternv12resizecropdet(const unsigned char *yuv_raw,
                            unsigned char yuv_undistort,
                            int *fastmap,
                            int width,
                            int height) {
#if defined(__ARM__NEON__)
    uint32_t tmp[4] = {0};
    tmp[0] = 3;
    tmp[1] = SRC_WIDTH;
    asm volatile (
        "mov x1, %[hei]\n" //< x1 for height loop
        "mov x2, %[map]\n" //< x2 for map pointer
        "mov x3, %[yuv444_dst]\n" //< x3 for dst pointer
        "ld1 {v2.4s}, [%[tmp]]\n"
        "height_loop:"
        "mov x0, %[wid]\n" //< x0 for width loop
        "width_loop:"
        "ld2 {v0.4s, v1.4s}, [x2], #32\n" //< v0 is u_distort,v1 is v_distort
        "mul v0.4s, v0.4s, v2.4s[0]\n"
        "mul v1.4s, v1.4s, v2.4s[0]\n"
        "mul v1.4s, v1.4s, v2.4s[1]\n"
        "add v0.4s, v0.4s, v1.4s[0]\n"
        "mov w4, v0.4s[0]\n" //< get the offset of the first pixel
        "ldr x4, [%[yuv_raw], x4]\n"
        "str x4, [x3], #3\n" //< store the first pixel's yuv into memory
        "mov w4, v0.4s[1]\n"
        "ldr x4, [%[yuv_raw], x4]\n"
        "str x4, [x3], #3\n"
        "mov w4, v0.4s[2]\n"
        "ldr x4, [%[yuv_raw], x4]\n"
        "str x4, [x3], #3\n"
        "mov w4, v0.4s[3]\n"
        "ldr x4, [%[yuv_raw], x4]\n"
        "str x4, [x3], #3\n"
        "sub x0, x0, #4\n"
        "cmp x0, #0\n"
        "bgt width_loop\n"
        "sub x1, x1, #1\n"
        "cmp x1, #0\n"
        "bgt height_loop\n"
        : [yuv444_dst] "+r" (yuv_undistort)
        : [yuv_raw] "r" (yuv_raw), [wid] "r" (width), [hei] "r" (height),
          [map] "r" (fastmap), [tmp] "r" (tmp)
        : "memory", "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2"
    );
#endif /* __ARM__NEON__ */
    return 0;
}

/// Copies `round` samples from `src_buffer` into `des_buffer` as directed by a
/// precomputed lookup table.
///
/// `fastmap` holds five ints per sample: destination row `h`, destination
/// column `w`, source offset of the first byte (`y`), one slot that this
/// function does not read, and the source offset of a byte pair (`u`, with the
/// second byte taken from `u + 1`).
///
/// The first byte is written at `h * det_target_w + w`. The byte pair is
/// written at row `(h >> 1) + det_target_h`, at column `w` with its lowest bit
/// cleared, i.e. in the region that starts `det_target_h` rows into
/// `des_buffer`.
///
/// No bounds checking is performed; the caller must guarantee that every
/// offset in the table is valid for the buffers supplied.
///
/// @param src_buffer    source bytes
/// @param round         number of table entries to process (<= 0 is a no-op)
/// @param des_buffer    destination bytes
/// @param fastmap       lookup table with at least `5 * round` ints
/// @param det_target_w  destination row stride
/// @param det_target_h  row count preceding the byte-pair region
/// @return always 0
int fastnv12resizecropdet(const unsigned char *src_buffer,
                          int round,
                          unsigned char *des_buffer,
                          int *fastmap,
                          int det_target_w,
                          int det_target_h) {
    for (int i = 0; i < round; ++i) {
        const int *entry = fastmap + i * 5;
        const int h = entry[0];
        const int w = entry[1];
        const int y = entry[2];
        // entry[3] is intentionally skipped: it is not consumed here.
        const int u = entry[4];
        const int v = u + 1;

        const int pair_idx =
            ((h >> 1) + det_target_h) * det_target_w + (w & 0xfffffffe);

        des_buffer[h * det_target_w + w] = src_buffer[y];
        des_buffer[pair_idx] = src_buffer[u];
        des_buffer[pair_idx + 1] = src_buffer[v];
    }
    return 0;
}

/// Benchmark driver: fills a lookup table and a source buffer with random
/// values in [0, 127], runs fastnv12resizecropdet one million times with
/// slowly varying sizes, and prints the total elapsed time in milliseconds.
int main() {
    constexpr std::size_t kFastmapSize = 768 * 768;
    constexpr std::size_t kBufferSize = 1920 * 1300;
    constexpr std::size_t kAlignment = 16;
    constexpr int kIterations = 1000000;

    // Random fastmap: 768x768 ints; source/destination: 1920x1300 bytes.
    // All three start on a 16-byte boundary.
    AlignedArray<int> fastmap(kFastmapSize, kAlignment);
    AlignedArray<unsigned char> src_buffer(kBufferSize, kAlignment);
    AlignedArray<unsigned char> dst_buffer(kBufferSize, kAlignment);

    // Values stay within [0, 127] so that every index derived from the table
    // remains inside the buffers for the sizes used below.
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> distrib(0, 127);

    int *fp = fastmap.get();
    for (std::size_t i = 0; i < kFastmapSize; ++i) {
        fp[i] = distrib(gen);
    }

    unsigned char *sp = src_buffer.get();
    for (std::size_t i = 0; i < kBufferSize; ++i) {
        sp[i] = static_cast<unsigned char>(distrib(gen));
    }

    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0, j = 0; i < kIterations; ++i, ++j) {
        fastnv12resizecropdet(sp, 100 + j, dst_buffer.get(), fp, 500 + j, 500 + j);
        if (j >= 100) {
            j = 0;
        }
    }
    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    std::cout << "Elpased Time: " << duration.count() << " ms" << std::endl;
    return 0;
}