|
- // g++ -std=c++14 -o main -O3 -g main.cpp
- // g++ -std=c++14 -o main -Ofast -g main.cpp
- #include <cstdlib>
-
- #include <iostream>
- #include <memory>
- #include <random>
- #include <chrono>
-
- #define BENCHMARK(X) do { /* Runs X once and prints its elapsed time in nanoseconds to std::cout. */ \
- auto start = std::chrono::high_resolution_clock::now(); \
- X; /* code under measurement; the macro's own locals (e.g. start) are visible to it */ \
- auto end = std::chrono::high_resolution_clock::now(); \
- auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start); \
- std::cout << "Elpased Time: " << duration.count() << " ns" << std::endl; \
- } while (0) // the do { ... } while (0) wrapper lets the expansion be used as one statement followed by ';'
-
- int fasternv12resizecropdet(const unsigned char *yuv_raw, unsigned char yuv_undistort, int *fastmap, int width, int height) /*
-  * Inline-assembly remap variant. The assembly is compiled only when
-  * __ARM__NEON__ is defined; otherwise the function does no work and returns 0.
-  *
-  * As written, the assembly runs `height` outer passes and, per pass, consumes
-  * the map four pairs at a time (ld2 of two 4 x 32-bit vectors, advancing the
-  * map pointer by 32 bytes). The two vectors are multiplied by tmp[0] (3) and
-  * tmp[1] (SRC_WIDTH) and combined into v0; each of v0's four lanes is then used
-  * as an offset into yuv_raw for a load, followed by a store to the destination
-  * pointer with a 3-byte post-increment.
-  *
-  * NOTE(review): the following look inconsistent and should be confirmed before
-  * this path is enabled:
-  *   - the guard is spelled __ARM__NEON__; compilers normally predefine
-  *     __ARM_NEON (and __ARM_NEON__), so this block is presumably never built;
-  *   - SRC_WIDTH is not defined in the visible source, and uint32_t is used
-  *     although <cstdint> is not among the visible includes;
-  *   - the input operand list names `yev_raw`, but the parameter is `yuv_raw`;
-  *   - `yuv_undistort` is an `unsigned char` passed by value, yet it is bound to
-  *     [yuv444_dst] and used as the store address (x3).
-  *
-  * Always returns 0.
-  */
- {
- #if defined(__ARM__NEON__) // NOTE(review): guard spelling, see the remarks above
- uint32_t tmp[4] = {0}; // constants loaded into v2 below; lanes 2 and 3 stay 0
- tmp[0] = 3; // multiplier applied to both map vectors (v2 lane 0)
- tmp[1] = SRC_WIDTH; // additional multiplier applied to v1 only (v2 lane 1)
- // registers are chosen by hand in the template and repeated in the clobber list
- asm volatile (
- "mov x1, %[hei]\n" //< x1 for height loop
- "mov x2, %[map]\n" //< x2 for map pointer
- "mov x3, %[yuv444_dst]\n" //< x3 for dst pointer
- "ld1 {v2.4s}, [%[tmp]]\n" // v2 = tmp[0..3]
-
- "height_loop:" // no "\n" here, so the label shares an assembler line with the next instruction
- "mov x0, %[wid]\n" //< x0 for width loop
-
- "width_loop:" // likewise concatenated with the ld2 that follows
- "ld2 {v0.4s, v1.4s}, [x2], #32\n" //< v0 is u_distort,v1 is v_distort
- "mul v0.4s, v0.4s, v2.4s[0]\n" // v0 *= tmp[0]
- "mul v1.4s, v1.4s, v2.4s[0]\n" // v1 *= tmp[0]
- "mul v1.4s, v1.4s, v2.4s[1]\n" // v1 *= tmp[1]
- "add v0.4s, v0.4s, v1.4s[0]\n" // v0 += v1, written with a lane selector on v1 -- NOTE(review): confirm the intended operand form
-
- "mov w4, v0.4s[0]\n" //< get the offset of the first pixel
- "ldr x4, [%[yuv_raw], x4]\n" // load from yuv_raw + offset -- NOTE(review): x4 is a 64-bit register while the destination advances 3 bytes per pixel; confirm the access width
- "str x4, [x3], #3\n" //< store the first pixel's yuv into memory
- "mov w4, v0.4s[1]\n" // lanes 1..3 repeat the same move/load/store sequence
- "ldr x4, [%[yuv_raw], x4]\n"
- "str x4, [x3], #3\n"
- "mov w4, v0.4s[2]\n"
- "ldr x4, [%[yuv_raw], x4]\n"
- "str x4, [x3], #3\n"
- "mov w4, v0.4s[3]\n"
- "ldr x4, [%[yuv_raw], x4]\n"
- "str x4, [x3], #3\n"
-
- "sub x0, x0, #4\n" // four lanes handled per width_loop pass
- "cmp x0, #0\n"
- "bgt width_loop\n"
-
- "sub x1, x1, #1\n" // one height_loop pass finished
- "cmp x1, #0\n"
- "bgt height_loop\n"
- : [yuv444_dst] "+r" (yuv_undistort) // read-write operand, copied into x3 as the destination address
- : [yuv_raw] "r" (yev_raw), [wid] "r" (width), [hei] "r" (height), [map] "r" (fastmap), [tmp] "r" (tmp) // read-only inputs
- : "memory", "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2" // memory plus every register named in the template
- );
- #endif /* __ARM__NEON__ */
-
- return 0; // unconditional; the only statement executed when the guard above is false
- }
-
/**
 * Copies source bytes into an NV12-style destination (per the function name)
 * according to a precomputed lookup map.
 *
 * `fastmap` is read as `round` consecutive records of five ints each:
 *   [0] h  - destination row
 *   [1] w  - destination column
 *   [2] y  - offset of the Y byte in src_buffer
 *   [3]      not read by this function
 *   [4] u  - offset of the U byte in src_buffer (the V byte is read at u + 1)
 *
 * For every record the Y byte is written to des_buffer[h * det_target_w + w],
 * and the U/V pair is written to row (h >> 1) + det_target_h at column w
 * rounded down to an even value. No bounds checking is performed; the caller
 * must supply a map whose indices fit both buffers.
 *
 * @param src_buffer   source bytes indexed by the offsets stored in the map
 * @param round        number of five-int records to process
 * @param des_buffer   destination buffer, row stride det_target_w
 * @param fastmap      lookup map holding at least round * 5 ints
 * @param det_target_w destination row stride in bytes
 * @param det_target_h number of rows preceding the U/V area
 * @return 0 on success (including round <= 0, where nothing is copied);
 *         -1 if round > 0 and any of the three pointers is null.
 */
int fastnv12resizecropdet(const unsigned char *src_buffer, int round, unsigned char *des_buffer, int *fastmap, int det_target_w, int det_target_h)
{
    constexpr int kRecordStride = 5;  // ints per map record

    if (round <= 0)
    {
        return 0;  // nothing to copy
    }
    if (src_buffer == nullptr || des_buffer == nullptr || fastmap == nullptr)
    {
        return -1;
    }

    for (int i = 0; i < round; ++i)
    {
        const int *const record = fastmap + i * kRecordStride;
        const int h = record[0];
        const int w = record[1];
        const int y = record[2];
        const int u = record[4];  // record[3] is not used
        const int v = u + 1;

        // `w & ~1` clears the low bit while keeping the arithmetic in int.
        const int uv_index = ((h >> 1) + det_target_h) * det_target_w + (w & ~1);

        des_buffer[h * det_target_w + w] = src_buffer[y];
        des_buffer[uv_index] = src_buffer[u];
        des_buffer[uv_index + 1] = src_buffer[v];
    }

    return 0;
}
-
- int main()
- {
- #define FASTMAP_SIZE static_cast<size_t>(768 * 768)
- #define BUFFER_SIZE static_cast<szie_t>(1920 * 1300)
-
- const size_t alignment = 16;
- size_t fastmap_offset = alignment - (FASTMAP_SIZE % alignment);
- size_t buffer_offset = alignment - (BUFFER_SIZE % alignment);
-
- // 分配内存并进行对齐
- void* fastmap_raw_memory = std::aligned_alloc(alignment, FASTMAP_SIZE + offset);
- void* alignedMemory = static_cast<char*>(rawMemory) + offset;
-
- // 创建unique_ptr并指定自定义删除器
- std::unique_ptr<int[], void(*)(void*)> arrPtr(
- static_cast<int*>(alignedMemory),
- [](void* ptr) { std::free(ptr); }
- );
-
- // Generate random fastmap: 768x768, src_buffer: 1920x1300
- auto fastmap = std::make_unique<int[]>(FASTMAP_SIZE);
- auto src_buffer = std::make_unique<unsigned char[]>(BUFFER_SIZE);
- auto dst_buffer = std::make_unique<unsigned char[]>(BUFFER_SIZE);
-
- // Fill with random data
- std::random_device rd;
- std::mt19937 gen(rd());
- std::uniform_int_distribution<> distrib(0, 127);
-
- // fastmap
- int *fp = fastmap.get();
- for (int i = 0; i < FASTMAP_SIZE; ++i)
- {
- fp[i] = distrib(gen);
- }
-
- // src_buffer
- unsigned char *sp = src_buffer.get();
- for (int i = 0; i < BUFFER_SIZE; ++i)
- {
- sp[i] = static_cast<unsigned char>(distrib(gen));
- }
-
- auto start = std::chrono::high_resolution_clock::now();
-
- for (int i = 0, j = 0; i < 1000000; ++i, ++j)
- {
- fastnv12resizecropdet(sp, 100 + j, dst_buffer.get(), fp, 500 + j, 500 + j);
- if (j >= 100)
- {
- j = 0;
- }
- }
-
- auto end = std::chrono::high_resolution_clock::now(); \
- auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start); \
- std::cout << "Elpased Time: " << duration.count() << " ms" << std::endl; \
-
- return 0;
- }
|