Merge 75e90b47c4 into d68ea6b4d5

10 months ago · 2b095b8acc
--- a/.gitignore
+++ b/.gitignore
@@ -60,3 +60,4 @@ python/setup.py

 # Xmake
 .xmake/
 CMakePresets.json
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -81,7 +81,7 @@ option(NCNN_SIMPLEVK "minimal in-house vulkan loader" ON)
 option(NCNN_SYSTEM_GLSLANG "use system glslang library" OFF)
 option(NCNN_RUNTIME_CPU "runtime dispatch cpu routines" ON)
 option(NCNN_DISABLE_PIC "disable position-independent code" OFF)
 option(NCNN_BUILD_TESTS "build tests" OFF)
 option(NCNN_BUILD_TESTS "build tests" ON)
 option(NCNN_COVERAGE "build for coverage" OFF)
 option(NCNN_ASAN "build for address sanitizer" OFF)
 option(NCNN_BUILD_BENCHMARK "build benchmark" ON)
@@ -89,6 +89,7 @@ option(NCNN_PYTHON "build python api" OFF)
 option(NCNN_INT8 "int8 inference" ON)
 option(NCNN_BF16 "bf16 inference" ON)
 option(NCNN_FORCE_INLINE "force inline some function" ON)
 option(NCNN_MUTITHREAD "enable multi thread bata" ON)

 if(ANDROID OR IOS OR NCNN_SIMPLESTL)
    option(NCNN_DISABLE_RTTI "disable rtti" ON)
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -48,6 +48,13 @@ if(ANDROID)
    list(APPEND ncnn_SRCS mat_pixel_android.cpp)
 endif()

 if(NCNN_MUTITHREAD)
    list(APPEND ncnn_SRCS thread.cpp)
    if(WIN32)
        list(APPEND ncnn_SRCS TheadInfo.cpp)
    endif()
 endif()

 ncnn_src_group(ncnn_SRCS "sources")

 include_directories("${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}")
@@ -267,6 +274,11 @@ if(NCNN_THREADS)
        target_link_libraries(ncnn PUBLIC pthread)
    endif()
 endif()
 if(NCNN_MUTITHREAD)
    if(NOT WIN32 AND (NOT NCNN_SIMPLEOMP) AND (NOT NCNN_SIMPLESTL))
        target_link_libraries(ncnn PUBLIC -pthread)
    endif()
 endif()

 if(NCNN_VULKAN)
    if(NCNN_SIMPLEVK)
--- a/src/TheadInfo.cpp
+++ b/src/TheadInfo.cpp
@@ -0,0 +1,69 @@
 #ifdef NCNN_MUTITHREAD
 #ifdef _WIN32

 #include "TheadInfo.h"
 namespace ncnn {

 // 初始化静态成员
 ThreadInfo* ThreadInfo::thread_info = nullptr;

 ThreadInfo::ThreadInfo(/* args */)
 {
    int groupCount = GetActiveProcessorGroupCount();
    for (WORD group = 0; group < groupCount; group++)
    {
        DWORD processorsInGroup = GetActiveProcessorCount(group);
        for (int i = 0; i < static_cast<int>(processorsInGroup); i++)
        {
            CoreInfo info;
            info.group = group;
            info.id = i + core_infos.size();
            info.affinity = (static_cast<DWORD_PTR>(1) << i);
            core_infos.push_back(info);
        }
    }
 }

 ThreadInfo* ThreadInfo::get()
 {
    static Mutex lock;
    AutoLock guard(lock);

    if (!thread_info)
    {
        thread_info = new ThreadInfo();
    }
    return thread_info;
 }

 CoreInfo ThreadInfo::getCurrentCore()
 {
    // 获取当前线程运行的CPU核心（支持多处理器组）
    DWORD_PTR process_affinity, system_affinity;
    GetProcessAffinityMask(GetCurrentProcess(), &process_affinity, &system_affinity);

    // 使用扩展API获取处理器组信息
    PROCESSOR_NUMBER proc_num;
    GetCurrentProcessorNumberEx(&proc_num);

    for (const auto& core : core_infos)
    {
        // 匹配组号和组内核心编号
        if (core.group == proc_num.Group && (core.affinity & (1ULL << proc_num.Number)))
        {
            return core;
        }
    }

    // 未找到时返回默认值
    return {-1, -1, 0};
 }

 void ThreadInfo::getAllCore(std::vector<CoreInfo>& out)
 {
    out = core_infos;
 }
 } // namespace ncnn

 #endif
 #endif
--- a/src/TheadInfo.h
+++ b/src/TheadInfo.h
@@ -0,0 +1,30 @@
 #ifndef THREAD_INFO_H
 #define THREAD_INFO_H
 #ifdef NCNN_MUTITHREAD
 #if defined _WIN32
 #include "cpu.h"
 namespace ncnn {
 struct CoreInfo
 {
 public:
    int id;
    int group;
    DWORD_PTR affinity;
 };
 class ThreadInfo
 {
 private:
    static ThreadInfo* thread_info;
    std::vector<CoreInfo> core_infos;
    ThreadInfo(/* args */);

 public:
    static ThreadInfo* get();
    CoreInfo getCurrentCore();
    void getAllCore(std::vector<CoreInfo>& out);
 };
 } // namespace ncnn

 #endif
 #endif
 #endif
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -1432,12 +1432,21 @@ static std::vector<int> get_max_freq_mhz()

 static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
 {
 #ifdef _WIN32
    GROUP_AFFINITY groupAffinity;
    ZeroMemory(&groupAffinity, sizeof(groupAffinity));
    groupAffinity.Group = static_cast<WORD>(thread_affinity_mask.cpu_group);
    groupAffinity.Mask = thread_affinity_mask.mask;

    SetThreadGroupAffinity(GetCurrentThread(), &groupAffinity, NULL);
 #else
    DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask);
    if (prev_mask == 0)
    {
        NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
        return -1;
    }
 #endif

    return 0;
 }
@@ -2273,22 +2282,27 @@ CpuSet::CpuSet()

 void CpuSet::enable(int cpu)
 {
    mask |= ((ULONG_PTR)1 << cpu);
    cpu_group = cpu / 64;
    mask |= ((ULONG_PTR)1 << (cpu - cpu_group * 64));
 }

 void CpuSet::disable(int cpu)
 {
    mask &= ~((ULONG_PTR)1 << cpu);
    cpu_group = cpu / 64;
    mask &= ~((ULONG_PTR)1 << (cpu - cpu_group * 64));
 }

 void CpuSet::disable_all()
 {
    cpu_group = 0;
    mask = 0;
 }

 bool CpuSet::is_enabled(int cpu) const
 {
    return mask & ((ULONG_PTR)1 << cpu);
    if (cpu_group != cpu / 64)
        return false;
    return mask & ((ULONG_PTR)1 << (cpu - cpu_group * 64));
 }

 int CpuSet::num_enabled() const
@@ -3273,4 +3287,38 @@ int set_flush_denormals(int flush_denormals)
 #endif
 }

 int get_multi_thread_batch()
 {
 #if defined(_NCNN_MUTITHREAD)
 #if defined _WIN32
    DWORD length = 0;
    GetLogicalProcessorInformation(NULL, &length);
    if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
        return 0;

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(length);

    int count = 0;
    if (GetLogicalProcessorInformation(buffer, &length))
    {
        DWORD offset = 0;
        while (offset < length)
        {
            if (buffer->Relationship == RelationProcessorCore)
                count++;

            offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
            buffer++;
        }
    }
    free(buffer);
    return count;
 #else
    return get_cpu_count();
 #endif
 #else
    return get_cpu_count();
 #endif
 }

 } // namespace ncnn
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -8,6 +8,7 @@

 #if defined _WIN32
 #define WIN32_LEAN_AND_MEAN
 #define _WIN32_WINNT 0x0601 // Windows 7+
 #include <windows.h>
 #endif
 #if defined __ANDROID__ || defined __linux__
@@ -30,6 +31,7 @@ public:

 public:
 #if defined _WIN32
    int cpu_group;
    ULONG_PTR mask;
 #endif
 #if defined __ANDROID__ || defined __linux__
@@ -172,6 +174,9 @@ NCNN_EXPORT void set_kmp_blocktime(int time_ms);
 NCNN_EXPORT int get_flush_denormals();
 NCNN_EXPORT int set_flush_denormals(int flush_denormals);

 // multi thread batch inference
 NCNN_EXPORT int get_multi_thread_batch();

 } // namespace ncnn

 #endif // NCNN_CPU_H
--- a/src/layer.cpp
+++ b/src/layer.cpp
@@ -98,6 +98,11 @@ int Layer::forward_inplace(Mat& /*bottom_top_blob*/, const Option& /*opt*/) cons
    return -1;
 }

 int Layer::forward_thread(void* /*info*/) const
 {
    return -1;
 }

 #if NCNN_VULKAN
 int Layer::upload_model(VkTransfer& /*cmd*/, const Option& /*opt*/)
 {
--- a/src/layer.h
+++ b/src/layer.h
@@ -94,6 +94,10 @@ public:
    // return 0 if success
    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
    /// @brief mutithread work function
    /// @param workspace thread infomation
    /// @return 0 if success
    virtual int forward_thread(void* workspace);

 #if NCNN_VULKAN
 public:
@@ -139,6 +143,7 @@ public:
 // layer factory function
 typedef Layer* (*layer_creator_func)(void*);
 typedef void (*layer_destroyer_func)(Layer*, void*);
 typedef int (*layer_work_func)(Layer*, void*);

 struct layer_registry_entry
 {
--- a/src/layer/absval.cpp
+++ b/src/layer/absval.cpp
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: BSD-3-Clause

 #include "absval.h"
 #include "thread.h"

 namespace ncnn {

@@ -17,6 +18,16 @@ int AbsVal::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;
    if (opt.num_threads > 64)
    {
        ThreadWorkspace workspace;
        workspace.layer = (Layer*)this;
        MutilThread thread(workspace, opt);
        std::vector<Mat> workspace_blobs;
        workspace_blobs.push_back(bottom_top_blob);
        thread.join(workspace_blobs);
        return 0;
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
@@ -33,4 +44,47 @@ int AbsVal::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
    return 0;
 }

 int AbsVal::forward_thread(void* workspace)
 {
    ThreadInfoExc* info = (ThreadInfoExc*)workspace;
    Mat& bottom_top_blob = info->mats->at(0);
    if (bottom_top_blob.elemsize == 1)
    {
        int8_t* ptr = (int8_t*)bottom_top_blob.data;
        const int8_t flag = 1 << 7;
        for (size_t i = info->start_index; i < info->end_index; i++)
        {
            if (ptr[i] & flag)
            {
                ptr[i] = -ptr[i];
            }
        }
    }
    else if (bottom_top_blob.elemsize == 2)
    {
        int16_t* ptr = (int16_t*)bottom_top_blob.data;
        const int16_t flag = 1 << 15;
        for (size_t i = info->start_index; i < info->end_index; i++)
        {
            if (ptr[i] & flag)
            {
                ptr[i] = -ptr[i];
            }
        }
    }
    else
    {
        float* ptr = (float*)bottom_top_blob.data;
        for (size_t i = info->start_index; i < info->end_index; i++)
        {
            if (ptr[i] < 0)
            {
                ptr[i] = -ptr[i];
            }
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/absval.h
+++ b/src/layer/absval.h
@@ -14,6 +14,7 @@ public:
    AbsVal();

    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
    virtual int forward_thread(void* workspace);
 };

 } // namespace ncnn
--- a/src/layer/batchnorm.h
+++ b/src/layer/batchnorm.h
@@ -19,6 +19,8 @@ public:

    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

    virtual int forward_thread(void* workspace);

 public:
    // param
    int channels;
--- a/src/platform.h.in
+++ b/src/platform.h.in
@@ -57,6 +57,7 @@
 #cmakedefine01 NCNN_INT8
 #cmakedefine01 NCNN_BF16
 #cmakedefine01 NCNN_FORCE_INLINE
 #cmakedefine01 NCNN_MUTITHREAD

 #cmakedefine NCNN_VERSION_STRING "@NCNN_VERSION_STRING@"

--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -0,0 +1,164 @@
 #include "thread.h"
 #include "cpu.h"
 #if defined __ANDROID__ || defined __linux__
 #include <sched.h>
 #endif

 #if defined _WIN32
 DWORD WINAPI winWorker(LPVOID lpParam)
 {
    ncnn::ThreadInfoExc* info = (ncnn::ThreadInfoExc*)lpParam;
    if (info->coreinfo->group >= 0 && info->coreinfo->affinity != 0)
    {
        GROUP_AFFINITY groupAffinity;
        ZeroMemory(&groupAffinity, sizeof(groupAffinity));
        groupAffinity.Group = static_cast<WORD>(info->coreinfo->group);
        groupAffinity.Mask = info->coreinfo->affinity;

        SetThreadGroupAffinity(GetCurrentThread(), &groupAffinity, NULL);
    }
    info->workspace->layer->forward_thread(info);
    info->manager->threadsComplete[info->threadid] = true;
    delete info;
    return 0;
 }
 #else
 void* pthreadWorker(void* lpParam)
 {
    ncnn::ThreadInfoExc* info = (ncnn::ThreadInfoExc*)lpParam;
 #if defined __ANDROID__ || defined __linux__
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(info->threadid, &cpuset);
    // 绑定到指定核心
    pthread_t current_thread = pthread_self();
    pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);
 #endif
    info->workspace->layer->forward_thread(info);
    info->manager->threadsComplete[info->threadid] = true;
    delete info;
    return nullptr;
 }
 #endif
 namespace ncnn {
 MutilThread::MutilThread(ThreadWorkspace _workspace, const Option& opt)
 {
    workspace = _workspace;
    m_opt = opt;
    threadsComplete.resize(opt.num_threads);
    for (int i = 0; i < opt.num_threads; i++)
    {
        threadsComplete[i] = false;
    }
    threadsComplete[helpid] = true;
 }

 MutilThread::~MutilThread()
 {
    threadsComplete.clear();
 }

 void MutilThread::join(std::vector<Mat>& mats)
 {
 #if defined _WIN32
    Mat mat = mats[0];
    CoreInfo cur = ThreadInfo::get()->getCurrentCore();
    std::vector<CoreInfo> cores;
    ThreadInfo::get()->getAllCore(cores);
    std::vector<HANDLE> handles;
    ThreadInfoExc* curinfo = nullptr;
    size_t workersize = ((mat.w * mat.h * mat.d) / m_opt.num_threads + 1) * mat.c * mat.elemsize;
    size_t matlen = mats.size();
    for (int i = 0; i < m_opt.num_threads; i++)
    {
        ThreadInfoExc* info = new ThreadInfoExc();
        info->threadid = i;
        info->start_index = i * workersize;
        info->end_index = (i + 1) * workersize;
        if (info->end_index > matlen)
        {
            info->end_index = matlen;
        }
        info->workspace = &workspace;
        info->mats = &mats;
        info->opt = &m_opt;
        info->coreinfo = &cores[i];
        threadsComplete[i] = false;
        info->manager = this;
        if (cur.id == cores[i].id)
        {
            helpid = i;
            threadsComplete[i] = true;
            handles.push_back(nullptr);
            curinfo = info;
            continue;
        }
        handles.push_back(CreateThread(nullptr, 0, winWorker, info, 0, nullptr));
    }
    workspace.layer->forward_thread(curinfo);
    delete curinfo;
    bool check = true;
    do
    {
        check = false;
        for (int i = 0; i < m_opt.num_threads; i++)
        {
            if (threadsComplete[i] == false)
            {
                check = true;
                break;
            }
        }
    } while (check);
    for (size_t i = 0; i < handles.size(); i++)
    {
        if (handles[i] != nullptr)
        {
            CloseHandle(handles[i]);
        }
    }
    handles.clear();
 #else
    Mat mat = mats[0];
    int curid = -1;
 #if defined __ANDROID__ || defined __linux__
    curid = sched_getcpu();
 #endif

    std::vector<pthread_t> pthread_handles;
    ThreadInfoExc* curinfo = nullptr;
    size_t workersize = ((mat.w * mat.h * mat.d) / m_opt.num_threads + 1) * mat.c * mat.elemsize;
    size_t matlen = mats.size();
    for (int i = 0; i < m_opt.num_threads; i++)
    {
        ThreadInfoExc* info = new ThreadInfoExc();
        info->threadid = i;
        info->start_index = i * workersize;
        info->end_index = (i + 1) * workersize;
        if (info->end_index > matlen)
        {
            info->end_index = matlen;
        }
        info->workspace = &workspace;
        info->mats = &mats;
        info->opt = &m_opt;
        threadsComplete[i] = false;
        info->manager = this;
        if (curid == cores[i].id && curid > -1)
        {
            helpid = i;
            threadsComplete[i] = true;
            curinfo = info;
            continue;
        }
        pthread_handles.push_back(pthread_create(&pthread_handles[i], nullptr, pthreadWorker, info));
    }
    workspace.layer->forward_thread(curinfo);
    delete curinfo;
    for (size_t i = 0; i < pthread_handles.size(); i++)
    {
        pthread_join(pthread_handles[i], nullptr);
    }
 #endif
 }
 } // namespace ncnn
--- a/src/thread.h
+++ b/src/thread.h
@@ -0,0 +1,42 @@
 #ifndef THREAD_H
 #define THREAD_H
 #include "layer.h"
 #include "TheadInfo.h"
 #if defined __ANDROID__ || defined __linux__ || defined __APPLE__
 #include <pthread.h>
 #endif
 namespace ncnn {
 class MutilThread;
 struct ThreadWorkspace
 {
    Layer* layer;
 };
 struct ThreadInfoExc
 {
    int threadid;
    size_t start_index;
    size_t end_index;
    ThreadWorkspace* workspace;
    std::vector<ncnn::Mat>* mats;
    Option* opt;
    MutilThread* manager;
 #if defined _WIN32
    CoreInfo* coreinfo;
 #endif
 };
 class MutilThread
 {
 private:
    Option m_opt;
    volatile int helpid;
    ThreadWorkspace workspace;

 public:
    MutilThread(ThreadWorkspace _workspace, const Option& opt);
    void join(std::vector<ncnn::Mat>& mats);
    std::vector<bool> threadsComplete;
    ~MutilThread();
 };

 } // namespace ncnn
 #endif
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -62,6 +62,7 @@ ncnn_add_test(c_api)
 ncnn_add_test(cpu)
 ncnn_add_test(expression)
 ncnn_add_test(paramdict)
 ncnn_add_test(thread)

 if(NCNN_VULKAN)
    ncnn_add_test(command)
--- a/tests/test_thread.cpp
+++ b/tests/test_thread.cpp
@@ -0,0 +1,115 @@
 #include "testutil.h"
 #include "thread.h"

 class TestLayer : public ncnn::Layer
 {
 public:
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt)
    {
        ThreadWorkspace workspace;
        workspace.layer = (Layer*)this;
        MutilThread thread(workspace, opt);
        std::vector<Mat> workspace_blobs;
        workspace_blobs.push_back(bottom_top_blob);
        thread.join(workspace_blobs);
        return 0;
    }
    virtual int forward_thread(void* workspace)
    {
        ThreadInfoExc* info = (ThreadInfoExc*)workspace;
        Mat& bottom_top_blob = info->mats->at(0);
        if (bottom_top_blob.elemsize == 1)
        {
            int8_t* ptr = (int8_t*)bottom_top_blob.data;
            const int8_t flag = 1 << 7;
            for (size_t i = info->start_index; i < info->end_index; i++)
            {
                if (ptr[i] & flag)
                {
                    ptr[i] = -ptr[i];
                }
            }
        }
        else if (bottom_top_blob.elemsize == 2)
        {
            int16_t* ptr = (int16_t*)bottom_top_blob.data;
            const int16_t flag = 1 << 15;
            for (size_t i = info->start_index; i < info->end_index; i++)
            {
                if (ptr[i] & flag)
                {
                    ptr[i] = -ptr[i];
                }
            }
        }
        else
        {
            float* ptr = (float*)bottom_top_blob.data;
            for (size_t i = info->start_index; i < info->end_index; i++)
            {
                if (ptr[i] < 0)
                {
                    ptr[i] = -ptr[i];
                }
            }
        }

        return 0;
    }
 };

 static int test_thread(const ncnn::Mat& a)
 {
    ncnn::ParamDict pd;

    std::vector<ncnn::Mat> weights(0);

    int ret = test_layer("TestLayer", pd, weights, a);
    if (ret != 0)
    {
        fprintf(stderr, "test_thread failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c);
    }

    return ret;
 }

 static int test_thread_0()
 {
    return 0
           || test_thread(RandomMat(5, 6, 7, 24))
           || test_thread(RandomMat(5, 6, 7, 12))
           || test_thread(RandomMat(5, 6, 7, 13));
 }

 static int test_thread_1()
 {
    return 0
           || test_thread(RandomMat(5, 7, 24))
           || test_thread(RandomMat(5, 6, 24))
           || test_thread(RandomMat(7, 9, 24));
 }

 static int test_thread_2()
 {
    return 0
           || test_thread(RandomMat(7, 12))
           || test_thread(RandomMat(5, 12))
           || test_thread(RandomMat(9, 12));
 }

 static int test_thread_3()
 {
    return 0
           || test_thread(RandomMat(7))
           || test_thread(RandomMat(128))
           || test_thread(RandomMat(256));
 }

 int main()
 {
    return 0
           || test_thread_0()
           || test_thread_1()
           || test_thread_2()
           || test_thread_3();
 }