From daa18abb191e5ae6cae8d22096fb9768ed056d34 Mon Sep 17 00:00:00 2001
From: DC Technology <412200533@qq.com>
Date: Tue, 22 Jul 2025 01:49:44 +0800
Subject: [PATCH 01/11] add threadUtillTools

---
 .gitignore            |   1 +
 CMakeLists.txt        |   1 +
 src/CMakeLists.txt    |  12 ++++
 src/TheadInfo.cpp     |  68 ++++++++++++++++++++
 src/TheadInfo.h       |  29 +++++++++
 src/cpu.cpp           |  34 ++++++++++
 src/cpu.h             |   5 ++
 src/layer.h           |   5 ++
 src/layer/absval.cpp  |  46 ++++++++++++++
 src/layer/absval.h    |   1 +
 src/layer/batchnorm.h |   2 +
 src/platform.h.in     |   1 +
 src/thread.cpp        | 143 ++++++++++++++++++++++++++++++++++++++++++
 src/thread.h          |  39 ++++++++++++
 14 files changed, 387 insertions(+)
 create mode 100644 src/TheadInfo.cpp
 create mode 100644 src/TheadInfo.h
 create mode 100644 src/thread.cpp
 create mode 100644 src/thread.h
diff --git a/.gitignore b/.gitignore
index cd69c526f..97e44879b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -60,3 +60,4 @@ python/setup.py
 
 # Xmake
 .xmake/
+CMakePresets.json
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 800bf47ca..9a86c8ca5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,6 +89,7 @@ option(NCNN_PYTHON "build python api" OFF)
 option(NCNN_INT8 "int8 inference" ON)
 option(NCNN_BF16 "bf16 inference" ON)
 option(NCNN_FORCE_INLINE "force inline some function" ON)
+option(NCNN_MUTITHREAD "enable multi thread bata" ON)
 
 if(ANDROID OR IOS OR NCNN_SIMPLESTL)
     option(NCNN_DISABLE_RTTI "disable rtti" ON)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 261221104..57f1fbf42 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -48,6 +48,13 @@ if(ANDROID)
     list(APPEND ncnn_SRCS mat_pixel_android.cpp)
 endif()
 
+if(NCNN_MUTITHREAD) # experimental hand-rolled worker threads (thread.cpp)
+    list(APPEND ncnn_SRCS thread.cpp)
+    if(WIN32) # processor-group enumeration is only built on Windows
+        list(APPEND ncnn_SRCS TheadInfo.cpp) # on-disk name is TheadInfo.cpp (sic); ThreadInfo.cpp does not exist
+    endif()
+endif()
+
 ncnn_src_group(ncnn_SRCS "sources")
 
 include_directories("${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}")
@@ -266,6 +273,11 @@ if(NCNN_THREADS)
         target_link_libraries(ncnn PUBLIC pthread)
     endif()
 endif()
+if(NCNN_MUTITHREAD) # thread.cpp calls pthread_create/pthread_join on non-Windows targets
+    if(NOT WIN32 AND (NOT NCNN_SIMPLEOMP) AND (NOT NCNN_SIMPLESTL))
+        target_link_libraries(ncnn PUBLIC -pthread)
+    endif()
+endif()
 
 if(NCNN_VULKAN)
     if(NCNN_SIMPLEVK)
diff --git a/src/TheadInfo.cpp b/src/TheadInfo.cpp
new file mode 100644
index 000000000..01b3c9b92
--- /dev/null
+++ b/src/TheadInfo.cpp
@@ -0,0 +1,68 @@
+#include "platform.h" // defines NCNN_MUTITHREAD as 0 or 1 (#cmakedefine01); must come before the test below
+#if NCNN_MUTITHREAD
+#ifdef _WIN32
+#include "TheadInfo.h"
+namespace ncnn
+{
+
+// 初始化静态成员
+ThreadInfo* ThreadInfo::thread_info = nullptr;
+
+ThreadInfo::ThreadInfo() // records one CoreInfo per logical processor of every active processor group
+{
+    const WORD groupCount = GetActiveProcessorGroupCount();
+    for (WORD group = 0; group < groupCount; group++) {
+        const DWORD processorsInGroup = GetActiveProcessorCount(group);
+        for (DWORD i = 0; i < processorsInGroup; i++) {
+            CoreInfo info;
+            info.group = group;
+            info.id = static_cast<int>(core_infos.size()); // running index; the old "i + size()" skipped ids and could repeat across groups
+            info.affinity = (static_cast<DWORD_PTR>(1) << i); // single-bit mask: processor i within its group
+            core_infos.push_back(info);
+        }
+    }
+}
+
+ThreadInfo* ThreadInfo::get() // returns the shared instance, creating it on first use
+{
+    static Mutex lock;
+    AutoLock guard(lock); // NOTE(review): AutoLock is not declared anywhere in this patch - confirm it exists
+    // The instance is allocated once and never deleted in this file.
+    if (!thread_info)
+    {
+        thread_info = new ThreadInfo();
+    }
+    return thread_info;
+}
+
+CoreInfo ThreadInfo::getCurrentCore()
+{
+    // Returns the CoreInfo of the logical processor the calling thread is
+    // running on right now, or {-1, -1, 0} when it is not in core_infos.
+    // NOTE(review): the answer is a snapshot; confirm callers pin the thread if they rely on it.
+    //
+    // The extended API reports the processor group as well as the in-group number.
+    PROCESSOR_NUMBER proc_num;
+    GetCurrentProcessorNumberEx(&proc_num);
+    const DWORD_PTR current_bit = static_cast<DWORD_PTR>(1) << proc_num.Number;
+    for (const auto& core : core_infos)
+    {
+        // Match on the group and on the processor's bit within that group.
+        if (core.group == proc_num.Group && (core.affinity & current_bit))
+        {
+            return core;
+        }
+    }
+    // (The unused GetProcessAffinityMask query from the original was dropped.)
+    // Not found: return the sentinel value.
+    return { -1, -1, 0 };
+}
+
+void ThreadInfo::getAllCore(std::vector<CoreInfo>& out) // copies the full core table into out
+{
+    out = core_infos; // overwrites any previous contents of out
+}
+}
+
+#endif
+#endif
diff --git a/src/TheadInfo.h b/src/TheadInfo.h
new file mode 100644
index 000000000..6dd0669bf
--- /dev/null
+++ b/src/TheadInfo.h
@@ -0,0 +1,29 @@
+#ifndef THREAD_INFO_H
+#define THREAD_INFO_H
+#include "cpu.h" // first: pulls in <windows.h> on Windows; NOTE(review): assumed to include platform.h, which defines NCNN_MUTITHREAD
+#if NCNN_MUTITHREAD // the option is emitted with #cmakedefine01 (always defined, 0 or 1), so #ifdef cannot be used
+#if defined _WIN32
+namespace ncnn
+{
+struct CoreInfo{ // one logical processor
+    int id;             // identifier assigned by the ThreadInfo constructor
+    int group;          // Windows processor group number
+    DWORD_PTR affinity; // bit mask selecting the processor inside its group
+};
+class ThreadInfo // table of logical processors; was declared as "TheadInfo" while TheadInfo.cpp defines ThreadInfo::...
+{
+private:
+    static ThreadInfo* thread_info;   // instance handed out by get()
+    std::vector<CoreInfo> core_infos; // filled once by the constructor
+    ThreadInfo();
+public:
+    static ThreadInfo* get();                    // shared instance, created on first call
+    CoreInfo getCurrentCore();                   // entry for the processor running the caller, or {-1, -1, 0}
+    void getAllCore(std::vector<CoreInfo>& out); // copies every entry into out
+};
+typedef ThreadInfo TheadInfo; // thread.cpp still spells the name "TheadInfo"
+}
+
+#endif
+#endif
+#endif
\ No newline at end of file
diff --git a/src/cpu.cpp b/src/cpu.cpp
index 9f91812b9..021aa5a1f 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -3266,4 +3266,38 @@ int set_flush_denormals(int flush_denormals)
 #endif
 }
 
+int get_multi_thread_batch(){
+    // Number of workers to use for the experimental multi-thread path:
+    // physical core count on Windows, get_cpu_count() everywhere else and
+    // whenever the Windows query fails.
+#if NCNN_MUTITHREAD && defined _WIN32
+    // NCNN_MUTITHREAD comes from platform.h (#cmakedefine01); the original tested
+    // the misspelled _NCNN_MUTITHREAD, which is never defined, so this code was dead.
+    DWORD length = 0;
+    GetLogicalProcessorInformation(NULL, &length); // first call only reports the needed size
+    if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+        return get_cpu_count();
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = 
+        (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(length);
+    if (!buffer)
+        return get_cpu_count();
+    int count = 0;
+    if (GetLogicalProcessorInformation(buffer, &length))
+    {
+        // Index into the array instead of advancing buffer: the original
+        // incremented buffer and then passed the moved pointer to free().
+        const DWORD entries = length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+        for (DWORD i = 0; i < entries; i++)
+        {
+            if (buffer[i].Relationship == RelationProcessorCore)
+                count++; // one entry of this kind per physical core
+        }
+    }
+    free(buffer);
+    return count > 0 ? count : get_cpu_count();
+#else
+    return get_cpu_count();
+#endif
+}
+
 } // namespace ncnn
diff --git a/src/cpu.h b/src/cpu.h
index cbf417111..7ffd2f6fc 100644
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -8,6 +8,7 @@
 
 #if defined _WIN32
 #define WIN32_LEAN_AND_MEAN
+#define _WIN32_WINNT 0x0601 // Windows 7+
 #include <windows.h>
 #endif
 #if defined __ANDROID__ || defined __linux__
@@ -172,6 +173,10 @@ NCNN_EXPORT void set_kmp_blocktime(int time_ms);
 NCNN_EXPORT int get_flush_denormals();
 NCNN_EXPORT int set_flush_denormals(int flush_denormals);
 
+
+// multi thread batch inference
+NCNN_EXPORT int get_multi_thread_batch();
+
 } // namespace ncnn
 
 #endif // NCNN_CPU_H
diff --git a/src/layer.h b/src/layer.h
index 5351de1c0..c65656d12 100644
--- a/src/layer.h
+++ b/src/layer.h
@@ -94,6 +94,10 @@ public:
     // return 0 if success
     virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+    /// @brief Worker entry point for the multi-thread path: processes the slice described by workspace.
+    /// @param workspace opaque per-thread information (thread.cpp passes a ThreadInfoExc*)
+    /// @return 0 if success; this default does nothing and returns -1
+    virtual int forward_thread(void* workspace) { (void)workspace; return -1; }
 
 #if NCNN_VULKAN
 public:
@@ -139,6 +143,7 @@ public:
 // layer factory function
 typedef Layer* (*layer_creator_func)(void*);
 typedef void (*layer_destroyer_func)(Layer*, void*);
+typedef int (*layer_work_func)(Layer*,void*);
 
 struct layer_registry_entry
 {
diff --git a/src/layer/absval.cpp b/src/layer/absval.cpp
index 2f38d3520..2d1cea4c5 100644
--- a/src/layer/absval.cpp
+++ b/src/layer/absval.cpp
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "absval.h"
+#include "thread.h"
 
 namespace ncnn {
 
@@ -17,6 +18,15 @@ int AbsVal::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
     int size = w * h;
+    if(opt.num_threads > 64){
+        ThreadWorkspace workspace;
+        workspace.layer = (Layer*)this;
+        MutilThread thread(workspace,opt);
+        std::vector<Mat> workspace_blobs;
+        workspace_blobs.push_back(bottom_top_blob);
+        thread.join(workspace_blobs);
+        return 0;
+    }
 
     #pragma omp parallel for num_threads(opt.num_threads)
     for (int q = 0; q < channels; q++)
@@ -33,4 +43,40 @@ int AbsVal::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     return 0;
 }
 
+int AbsVal::forward_thread(void* workspace) // worker body: in-place abs over [start_index, end_index) of the first blob
+{
+    ThreadInfoExc* info = (ThreadInfoExc*)workspace; // the pointer is reinterpreted as ThreadInfoExc without any check
+    Mat& bottom_top_blob = info->mats->at(0);
+    if(bottom_top_blob.elemsize==1){ // 1-byte elements are handled as int8_t
+        int8_t* ptr = (int8_t*)bottom_top_blob.data;
+        const int8_t flag = 1<<7; // sign bit
+        for (size_t i = info->start_index; i < info->end_index; i++) // indices are applied to the flat data pointer
+        {
+            if(ptr[i]&flag){ // sign bit set: value is negative
+                ptr[i] = -ptr[i]; // NOTE(review): -128 has no positive int8_t counterpart - confirm intended result
+            }
+        }
+    }else if (bottom_top_blob.elemsize==2)
+    { // NOTE(review): 2-byte elements are negated as int16_t; if they can hold half-precision floats this is not abs - confirm
+        int16_t* ptr = (int16_t*)bottom_top_blob.data;
+        const int16_t flag = 1<<15; // sign bit
+        for (size_t i = info->start_index; i < info->end_index; i++)
+        {
+            if(ptr[i]&flag){
+                ptr[i] = -ptr[i];
+            }
+        }
+    }else{ // every other element size is handled as float
+        float* ptr = (float*)bottom_top_blob.data;
+        for (size_t i = info->start_index; i < info->end_index; i++)
+        {
+            if(ptr[i]<0){
+                ptr[i] = -ptr[i];
+            }
+        }
+    }
+    // Always reports success; no failure path exists in this function.
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/absval.h b/src/layer/absval.h
index deb9540d0..619cfeb64 100644
--- a/src/layer/absval.h
+++ b/src/layer/absval.h
@@ -14,6 +14,7 @@ public:
     AbsVal();
 
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+    virtual int forward_thread(void* workspace);
 };
 
 } // namespace ncnn
diff --git a/src/layer/batchnorm.h b/src/layer/batchnorm.h
index 6043d0e41..0deedba46 100644
--- a/src/layer/batchnorm.h
+++ b/src/layer/batchnorm.h
@@ -19,6 +19,8 @@ public:
 
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
+    virtual int forward_thread(void* workspace);
+
 public:
     // param
     int channels;
diff --git a/src/platform.h.in b/src/platform.h.in
index 8b7357eec..79a79db04 100644
--- a/src/platform.h.in
+++ b/src/platform.h.in
@@ -57,6 +57,7 @@
 #cmakedefine01 NCNN_INT8
 #cmakedefine01 NCNN_BF16
 #cmakedefine01 NCNN_FORCE_INLINE
+#cmakedefine01 NCNN_MUTITHREAD
 
 #cmakedefine NCNN_VERSION_STRING "@NCNN_VERSION_STRING@"
 
diff --git a/src/thread.cpp b/src/thread.cpp
new file mode 100644
index 000000000..8da515902
--- /dev/null
+++ b/src/thread.cpp
@@ -0,0 +1,143 @@
+#include "thread.h"
+#include "cpu.h"
+
+#if defined _WIN32
+DWORD WINAPI winWorker(LPVOID lpParam) // thread entry: pin to the assigned core, run the layer slice, flag completion
+{
+    ncnn::ThreadInfoExc* info = (ncnn::ThreadInfoExc*)lpParam; // allocated by join(); freed at the end of this function
+    if (info->coreinfo->group >= 0 && info->coreinfo->affinity != 0) {
+        GROUP_AFFINITY groupAffinity;
+        ZeroMemory(&groupAffinity, sizeof(groupAffinity));
+        groupAffinity.Group = static_cast<WORD>(info->coreinfo->group);
+        groupAffinity.Mask = info->coreinfo->affinity;
+        // Best effort only: the original returned here, so the worker never ran, leaked info and never set its flag.
+        SetThreadGroupAffinity(GetCurrentThread(), &groupAffinity, NULL);
+    }
+    info->workspace->layer->forward_thread(info);
+    info->manager->threadsComplete[info->threadid] = true; // join() in this file polls this flag
+    delete info;
+    return 0;
+}
+#else
+void* pthreadWorker(void* lpParam) // thread entry: pin to a CPU, run the layer slice, flag completion
+{
+    ncnn::ThreadInfoExc* info = (ncnn::ThreadInfoExc*)lpParam;
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+    CPU_SET(info->threadid, &cpuset); // the worker index doubles as the CPU number
+    // NOTE(review): the result of pthread_setaffinity_np below is ignored - confirm best effort is intended.
+    // Pin to the designated core
+    pthread_t current_thread = pthread_self();
+    pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);
+    info->workspace->layer->forward_thread(info); 
+    info->manager->threadsComplete[info->threadid] = true;
+    delete info; // info is freed here, by the worker
+    return nullptr;
+}
+#endif
+namespace ncnn
+{
+MutilThread::MutilThread(ThreadWorkspace _workspace, const Option& opt)
+{
+    workspace = _workspace;
+    m_opt = opt;
+    // One completion flag per worker slot, all initially "not finished".
+    threadsComplete.assign(opt.num_threads > 0 ? opt.num_threads : 0, false);
+    // helpid (the slot run by the calling thread) is chosen in join(); -1 = not chosen yet.
+    // The original indexed threadsComplete with the still-uninitialized helpid here.
+    helpid = -1;
+}
+
+MutilThread::~MutilThread() // only empties the flag vector
+{
+    threadsComplete.clear(); // the vector's own destructor would also release it
+}
+
+void MutilThread::join(std::vector<Mat>& mats)
+{
+    // Splits mats[0] into one contiguous element range per worker slot, runs one
+    // slot on the calling thread and the rest on new threads, then waits for them.
+    // Fixed relative to the original: ranges were clamped to mats.size() (the number
+    // of Mats) instead of the element count; the caller's slot went through
+    // forward_inplace(curinfo) with a possibly null curinfo; cores[i] could be read
+    // out of bounds; the non-Windows branch used undeclared mat/cur/cores and pushed
+    // pthread_create's int result as a handle; completion was a busy-wait on flags.
+    const int nthreads = m_opt.num_threads;
+    if (mats.empty() || nthreads <= 0 || workspace.layer == nullptr)
+        return;
+    // NOTE(review): total() is assumed to be the blob's element count; elempack is ignored - confirm.
+    const size_t total = mats[0].total();
+    const size_t chunk = (total + nthreads - 1) / nthreads; // ceil(total / nthreads)
+    threadsComplete.assign(nthreads, false);
+    int self_slot = 0; // slot executed by the calling thread
+#if defined _WIN32
+    std::vector<CoreInfo> cores;
+    TheadInfo::get()->getAllCore(cores);
+    if (cores.empty())
+    {
+        const CoreInfo unpinned = {-1, -1, 0}; // winWorker skips pinning for this value
+        cores.push_back(unpinned);
+    }
+    const CoreInfo cur = TheadInfo::get()->getCurrentCore(); // prefer the slot on the caller's own core
+    for (int i = 0; i < nthreads; i++)
+    {
+        if (cores[i % cores.size()].id == cur.id)
+        {
+            self_slot = i;
+            break;
+        }
+    }
+    std::vector<HANDLE> handles;
+#else
+    std::vector<pthread_t> handles;
+#endif
+    helpid = self_slot;
+    ThreadInfoExc* curinfo = nullptr;
+    for (int i = 0; i < nthreads; i++)
+    {
+        ThreadInfoExc* info = new ThreadInfoExc();
+        info->threadid = i;
+        info->start_index = (i * chunk < total) ? i * chunk : total;
+        info->end_index = (info->start_index + chunk < total) ? info->start_index + chunk : total;
+        info->workspace = &workspace;
+        info->mats = &mats;
+        info->opt = &m_opt;
+        info->manager = this;
+#if defined _WIN32
+        info->coreinfo = &cores[i % cores.size()]; // wrap around when there are more slots than cores
+#endif
+        if (i == self_slot)
+        {
+            curinfo = info; // processed below, after the other workers have been started
+            continue;
+        }
+#if defined _WIN32
+        HANDLE handle = CreateThread(nullptr, 0, winWorker, info, 0, nullptr);
+        const bool started = (handle != nullptr);
+#else
+        pthread_t handle;
+        const bool started = (pthread_create(&handle, nullptr, pthreadWorker, info) == 0);
+#endif
+        if (started)
+            handles.push_back(handle);
+        else
+        {
+            // No thread could be created: process this range here so it is not skipped.
+            workspace.layer->forward_thread(info);
+            delete info;
+        }
+    }
+    workspace.layer->forward_thread(curinfo);
+    delete curinfo;
+    for (size_t i = 0; i < handles.size(); i++)
+    {
+#if defined _WIN32
+        WaitForSingleObject(handles[i], INFINITE); // block instead of spinning on the flags
+        CloseHandle(handles[i]);
+#else
+        pthread_join(handles[i], nullptr);
+#endif
+    }
+    threadsComplete.assign(nthreads, true); // every slot is done once all threads have been waited for
+}
+} // namespace ncnn
\ No newline at end of file
diff --git a/src/thread.h b/src/thread.h
new file mode 100644
index 000000000..7e6a43773
--- /dev/null
+++ b/src/thread.h
@@ -0,0 +1,39 @@
+#ifndef THREAD_H
+#define THREAD_H
+#include "layer.h"
+#include "TheadInfo.h"
+#if defined __ANDROID__ || defined __linux__ || defined __APPLE__
+#include <pthread.h>
+#endif
+namespace ncnn
+{
+    class MutilThread; // defined below; ThreadInfoExc only stores a pointer to it
+    struct ThreadWorkspace{ // declared first because ThreadInfoExc refers to it
+        Layer* layer; // layer whose forward_thread() each worker calls
+    };
+    struct ThreadInfoExc{ // per-worker arguments, one instance per slot
+        int threadid;               // slot index
+        size_t start_index;         // first element of this worker's range
+        size_t end_index;           // one past the last element of the range
+        ThreadWorkspace* workspace;
+        std::vector<ncnn::Mat>* mats;
+        Option* opt;
+        MutilThread* manager;       // object holding the completion flags
+        #if defined _WIN32
+        CoreInfo* coreinfo;         // core the worker pins itself to
+        #endif
+    };
+    class MutilThread
+    {
+    private:
+        Option m_opt;
+        volatile int helpid; // slot run by the thread that calls join()
+        ThreadWorkspace workspace;
+    public:
+        MutilThread(ThreadWorkspace _workspace,const Option& opt);
+        void join(std::vector<ncnn::Mat>& mats);
+        std::vector<char> threadsComplete; // one byte per slot: vector<bool> packs bits, so concurrent writers would share a byte
+        ~MutilThread();
+    };
+} // namespace ncnn
+#endif

From 9c5280034a386385e5c4373506099d7f5d4bd35a Mon Sep 17 00:00:00 2001
From: DaChengTechnology <12637177+DaChengTechnology@users.noreply.github.com>
Date: Mon, 21 Jul 2025 18:12:26 +0000
Subject: [PATCH 02/11] apply code-format changes

---
 src/TheadInfo.cpp    | 21 +++++++------
 src/TheadInfo.h      |  9 +++---
 src/cpu.cpp          | 18 +++++------
 src/cpu.h            |  1 -
 src/layer.h          |  2 +-
 src/layer/absval.cpp | 30 +++++++++++-------
 src/thread.cpp       | 73 +++++++++++++++++++++++++-------------------
 src/thread.h         | 60 ++++++++++++++++++------------------
 8 files changed, 118 insertions(+), 96 deletions(-)

diff --git a/src/TheadInfo.cpp b/src/TheadInfo.cpp
index 01b3c9b92..f49f6eb8a 100644
--- a/src/TheadInfo.cpp
+++ b/src/TheadInfo.cpp
@@ -2,8 +2,7 @@
 #ifdef _WIN32
 
 #include "TheadInfo.h"
-namespace ncnn
-{
+namespace ncnn {
 
 // 初始化静态成员
 ThreadInfo* ThreadInfo::thread_info = nullptr;
@@ -11,9 +10,11 @@ ThreadInfo* ThreadInfo::thread_info = nullptr;
 ThreadInfo::ThreadInfo(/* args */)
 {
     int groupCount = GetActiveProcessorGroupCount();
-    for (WORD group = 0; group < groupCount; group++) {
+    for (WORD group = 0; group < groupCount; group++)
+    {
         DWORD processorsInGroup = GetActiveProcessorCount(group);
-        for (int i = 0; i < static_cast<int>(processorsInGroup); i++) {
+        for (int i = 0; i < static_cast<int>(processorsInGroup); i++)
+        {
             CoreInfo info;
             info.group = group;
             info.id = i + core_infos.size();
@@ -27,7 +28,7 @@ ThreadInfo* ThreadInfo::get()
 {
     static Mutex lock;
     AutoLock guard(lock);
-    
+
     if (!thread_info)
     {
         thread_info = new ThreadInfo();
@@ -40,11 +41,11 @@ CoreInfo ThreadInfo::getCurrentCore()
     // 获取当前线程运行的CPU核心（支持多处理器组）
     DWORD_PTR process_affinity, system_affinity;
     GetProcessAffinityMask(GetCurrentProcess(), &process_affinity, &system_affinity);
-    
+
     // 使用扩展API获取处理器组信息
     PROCESSOR_NUMBER proc_num;
     GetCurrentProcessorNumberEx(&proc_num);
-    
+
     for (const auto& core : core_infos)
     {
         // 匹配组号和组内核心编号
@@ -53,16 +54,16 @@ CoreInfo ThreadInfo::getCurrentCore()
             return core;
         }
     }
-    
+
     // 未找到时返回默认值
-    return { -1, -1, 0 };
+    return {-1, -1, 0};
 }
 
 void ThreadInfo::getAllCore(std::vector<CoreInfo>& out)
 {
     out = core_infos;
 }
-}
+} // namespace ncnn
 
 #endif
 #endif
diff --git a/src/TheadInfo.h b/src/TheadInfo.h
index 6dd0669bf..25f1b74c3 100644
--- a/src/TheadInfo.h
+++ b/src/TheadInfo.h
@@ -3,10 +3,10 @@
 #ifdef NCNN_MUTITHREAD
 #if defined _WIN32
 #include "cpu.h"
-namespace ncnn
+namespace ncnn {
+struct CoreInfo
 {
-struct CoreInfo{
-    public:
+public:
     int id;
     int group;
     DWORD_PTR affinity;
@@ -17,12 +17,13 @@ private:
     static ThreadInfo* thread_info;
     std::vector<CoreInfo> core_infos;
     TheadInfo(/* args */);
+
 public:
     static ThreadInfo* get();
     CoreInfo getCurrentCore();
     void getAllCore(std::vector<CoreInfo>& out);
 };
-}
+} // namespace ncnn
 
 #endif
 #endif
diff --git a/src/cpu.cpp b/src/cpu.cpp
index 021aa5a1f..022981259 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -3266,17 +3266,17 @@ int set_flush_denormals(int flush_denormals)
 #endif
 }
 
-int get_multi_thread_batch(){
+int get_multi_thread_batch()
+{
 #if defined(_NCNN_MUTITHREAD)
-    #if defined _WIN32
-     DWORD length = 0;
+#if defined _WIN32
+    DWORD length = 0;
     GetLogicalProcessorInformation(NULL, &length);
     if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
         return 0;
 
-    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = 
-        (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(length);
-        
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(length);
+
     int count = 0;
     if (GetLogicalProcessorInformation(buffer, &length))
     {
@@ -3285,16 +3285,16 @@ int get_multi_thread_batch(){
         {
             if (buffer->Relationship == RelationProcessorCore)
                 count++;
-            
+
             offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
             buffer++;
         }
     }
     free(buffer);
     return count;
-    #else
+#else
     return get_cpu_count();
-    #endif
+#endif
 #else
     return get_cpu_count();
 #endif
diff --git a/src/cpu.h b/src/cpu.h
index 7ffd2f6fc..cf0f8e87e 100644
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -173,7 +173,6 @@ NCNN_EXPORT void set_kmp_blocktime(int time_ms);
 NCNN_EXPORT int get_flush_denormals();
 NCNN_EXPORT int set_flush_denormals(int flush_denormals);
 
-
 // multi thread batch inference
 NCNN_EXPORT int get_multi_thread_batch();
 
diff --git a/src/layer.h b/src/layer.h
index c65656d12..5bfd58742 100644
--- a/src/layer.h
+++ b/src/layer.h
@@ -143,7 +143,7 @@ public:
 // layer factory function
 typedef Layer* (*layer_creator_func)(void*);
 typedef void (*layer_destroyer_func)(Layer*, void*);
-typedef int (*layer_work_func)(Layer*,void*);
+typedef int (*layer_work_func)(Layer*, void*);
 
 struct layer_registry_entry
 {
diff --git a/src/layer/absval.cpp b/src/layer/absval.cpp
index 2d1cea4c5..61e355dda 100644
--- a/src/layer/absval.cpp
+++ b/src/layer/absval.cpp
@@ -18,10 +18,11 @@ int AbsVal::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
     int size = w * h;
-    if(opt.num_threads > 64){
+    if (opt.num_threads > 64)
+    {
         ThreadWorkspace workspace;
         workspace.layer = (Layer*)this;
-        MutilThread thread(workspace,opt);
+        MutilThread thread(workspace, opt);
         std::vector<Mat> workspace_blobs;
         workspace_blobs.push_back(bottom_top_blob);
         thread.join(workspace_blobs);
@@ -47,35 +48,42 @@ int AbsVal::forward_thread(void* workspace)
 {
     ThreadInfoExc* info = (ThreadInfoExc*)workspace;
     Mat& bottom_top_blob = info->mats->at(0);
-    if(bottom_top_blob.elemsize==1){
+    if (bottom_top_blob.elemsize == 1)
+    {
         int8_t* ptr = (int8_t*)bottom_top_blob.data;
-        const int8_t flag = 1<<7;
+        const int8_t flag = 1 << 7;
         for (size_t i = info->start_index; i < info->end_index; i++)
         {
-            if(ptr[i]&flag){
+            if (ptr[i] & flag)
+            {
                 ptr[i] = -ptr[i];
             }
         }
-    }else if (bottom_top_blob.elemsize==2)
+    }
+    else if (bottom_top_blob.elemsize == 2)
     {
         int16_t* ptr = (int16_t*)bottom_top_blob.data;
-        const int16_t flag = 1<<15;
+        const int16_t flag = 1 << 15;
         for (size_t i = info->start_index; i < info->end_index; i++)
         {
-            if(ptr[i]&flag){
+            if (ptr[i] & flag)
+            {
                 ptr[i] = -ptr[i];
             }
         }
-    }else{
+    }
+    else
+    {
         float* ptr = (float*)bottom_top_blob.data;
         for (size_t i = info->start_index; i < info->end_index; i++)
         {
-            if(ptr[i]<0){
+            if (ptr[i] < 0)
+            {
                 ptr[i] = -ptr[i];
             }
         }
     }
-    
+
     return 0;
 }
 
diff --git a/src/thread.cpp b/src/thread.cpp
index 8da515902..587c67666 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -5,15 +5,16 @@
 DWORD WINAPI winWorker(LPVOID lpParam)
 {
     ncnn::ThreadInfoExc* info = (ncnn::ThreadInfoExc*)lpParam;
-    if (info->coreinfo->group >= 0 && info->coreinfo->affinity != 0) {
+    if (info->coreinfo->group >= 0 && info->coreinfo->affinity != 0)
+    {
         GROUP_AFFINITY groupAffinity;
         ZeroMemory(&groupAffinity, sizeof(groupAffinity));
         groupAffinity.Group = static_cast<WORD>(info->coreinfo->group);
         groupAffinity.Mask = info->coreinfo->affinity;
-        
+
         return SetThreadGroupAffinity(GetCurrentThread(), &groupAffinity, NULL) != 0;
     }
-    info->workspace->layer->forward_thread(info); 
+    info->workspace->layer->forward_thread(info);
     info->manager->threadsComplete[info->threadid] = true;
     delete info;
     return 0;
@@ -25,24 +26,24 @@ void* pthreadWorker(void* lpParam)
     cpu_set_t cpuset;
     CPU_ZERO(&cpuset);
     CPU_SET(info->threadid, &cpuset);
-    
+
     // 绑定到指定核心
     pthread_t current_thread = pthread_self();
     pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);
-    info->workspace->layer->forward_thread(info); 
+    info->workspace->layer->forward_thread(info);
     info->manager->threadsComplete[info->threadid] = true;
     delete info;
     return nullptr;
 }
 #endif
-namespace ncnn
-{
+namespace ncnn {
 MutilThread::MutilThread(ThreadWorkspace _workspace, const Option& opt)
 {
     workspace = _workspace;
     m_opt = opt;
     threadsComplete.resize(opt.num_threads);
-    for(int i=0;i<opt.num_threads;i++){
+    for (int i = 0; i < opt.num_threads; i++)
+    {
         threadsComplete[i] = false;
     }
     threadsComplete[helpid] = true;
@@ -55,21 +56,23 @@ MutilThread::~MutilThread()
 
 void MutilThread::join(std::vector<Mat>& mats)
 {
-    #if defined _WIN32
+#if defined _WIN32
     Mat mat = mats[0];
     CoreInfo cur = TheadInfo::get()->getCurrentCore();
     std::vector<CoreInfo> cores;
     TheadInfo::get()->getAllCore(cores);
     std::vector<HANDLE> handles;
     ThreadInfoExc* curinfo = nullptr;
-    size_t workersize = ((mat.w*mat.h*mat.d)/m_opt.num_threads +1)*mat.c*mat.elemsize;
+    size_t workersize = ((mat.w * mat.h * mat.d) / m_opt.num_threads + 1) * mat.c * mat.elemsize;
     size_t matlen = mats.size();
-    for(int i=0;i<m_opt.num_threads;i++){
-        ThreadInfoExc *info = new ThreadInfoExc();
+    for (int i = 0; i < m_opt.num_threads; i++)
+    {
+        ThreadInfoExc* info = new ThreadInfoExc();
         info->threadid = i;
-        info->start_index = i*workersize;
-        info->end_index = (i+1)*workersize;
-        if(info->end_index>matlen){
+        info->start_index = i * workersize;
+        info->end_index = (i + 1) * workersize;
+        if (info->end_index > matlen)
+        {
             info->end_index = matlen;
         }
         info->workspace = &workspace;
@@ -78,45 +81,52 @@ void MutilThread::join(std::vector<Mat>& mats)
         info->coreinfo = &cores[i];
         threadsComplete[i] = false;
         info->manager = this;
-        if(cur.id==cores[i].id){
+        if (cur.id == cores[i].id)
+        {
             helpid = i;
             threadsComplete[i] = true;
             handles.push_back(nullptr);
             curinfo = info;
             continue;
         }
-        handles.push_back(CreateThread(nullptr,0,winWorker,info,0,nullptr));
+        handles.push_back(CreateThread(nullptr, 0, winWorker, info, 0, nullptr));
     }
     workspace.layer->forward_inplace(curinfo);
     delete curinfo;
     bool check = true;
-    do{
+    do
+    {
         check = false;
-        for(int i=0;i<m_opt.num_threads;i++){
-            if(threadsComplete[i]==false){
+        for (int i = 0; i < m_opt.num_threads; i++)
+        {
+            if (threadsComplete[i] == false)
+            {
                 check = true;
                 break;
             }
         }
-    }while(check);
+    } while (check);
     for (size_t i = 0; i < handles.size(); i++)
     {
-        if(handles[i]!=nullptr){
+        if (handles[i] != nullptr)
+        {
             CloseHandle(handles[i]);
         }
     }
     handles.clear();
-    #else
+#else
     std::vector<pthread_t> pthread_handles;
     ThreadInfoExc* curinfo = nullptr;
-    size_t workersize = ((mat.w*mat.h*mat.d)/m_opt.num_threads +1)*mat.c*mat.elemsize;
+    size_t workersize = ((mat.w * mat.h * mat.d) / m_opt.num_threads + 1) * mat.c * mat.elemsize;
     size_t matlen = mats.size();
-    for(int i=0;i<m_opt.num_threads;i++){
-        ThreadInfoExc *info = new ThreadInfoExc();
+    for (int i = 0; i < m_opt.num_threads; i++)
+    {
+        ThreadInfoExc* info = new ThreadInfoExc();
         info->threadid = i;
-        info->start_index = i*workersize;
-        info->end_index = (i+1)*workersize;
-        if(info->end_index>matlen){
+        info->start_index = i * workersize;
+        info->end_index = (i + 1) * workersize;
+        if (info->end_index > matlen)
+        {
             info->end_index = matlen;
         }
         info->workspace = &workspace;
@@ -124,7 +134,8 @@ void MutilThread::join(std::vector<Mat>& mats)
         info->opt = &m_opt;
         threadsComplete[i] = false;
         info->manager = this;
-        if(cur.id==cores[i].id){
+        if (cur.id == cores[i].id)
+        {
             helpid = i;
             threadsComplete[i] = true;
             curinfo = info;
@@ -138,6 +149,6 @@ void MutilThread::join(std::vector<Mat>& mats)
     {
         pthread_join(pthread_handles[i], nullptr);
     }
-    #endif
+#endif
 }
 } // namespace ncnn
\ No newline at end of file
diff --git a/src/thread.h b/src/thread.h
index 7e6a43773..163555cfe 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -5,35 +5,37 @@
 #if defined __ANDROID__ || defined __linux__ || defined __APPLE__
 #include <pthread.h>
 #endif
-namespace ncnn
+namespace ncnn {
+struct ThreadInfoExc
 {
-    struct ThreadInfoExc{
-        int threadid;
-        size_t start_index;
-        size_t end_index;
-        ThreadWorkspace* workspace;
-        std::vector<ncnn::Mat>* mats;
-        Option* opt;
-        MutilThread* manager;
-        #if defined _WIN32
-        CoreInfo* coreinfo;
-        #endif
-    };
-    struct ThreadWorkspace{
-        Layer* layer;
-    };
-    class MutilThread
-    {
-    private:
-        Option m_opt;
-        volatile int helpid;
-        ThreadWorkspace workspace;
-    public:
-        MutilThread(ThreadWorkspace _workspace,const Option& opt);
-        void join(std::vector<ncnn::Mat>& mats);
-        std::vector<bool> threadsComplete;
-        ~MutilThread();
-    };
-    
+    int threadid;
+    size_t start_index;
+    size_t end_index;
+    ThreadWorkspace* workspace;
+    std::vector<ncnn::Mat>* mats;
+    Option* opt;
+    MutilThread* manager;
+#if defined _WIN32
+    CoreInfo* coreinfo;
+#endif
+};
+struct ThreadWorkspace
+{
+    Layer* layer;
+};
+class MutilThread
+{
+private:
+    Option m_opt;
+    volatile int helpid;
+    ThreadWorkspace workspace;
+
+public:
+    MutilThread(ThreadWorkspace _workspace, const Option& opt);
+    void join(std::vector<ncnn::Mat>& mats);
+    std::vector<bool> threadsComplete;
+    ~MutilThread();
+};
+
 } // namespace ncnn
 #endif

From 2dc940de7fade4cfc5f7aba6d8784b4784080855 Mon Sep 17 00:00:00 2001
From: DC Technology <412200533@qq.com>
Date: Tue, 22 Jul 2025 02:13:36 +0800
Subject: [PATCH 03/11] fix thread.h

---
 src/thread.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/thread.h b/src/thread.h
index 7e6a43773..62f0c1c44 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -7,6 +7,9 @@
 #endif
 namespace ncnn
 {
+    struct ThreadWorkspace{
+        Layer* layer;
+    };
     struct ThreadInfoExc{
         int threadid;
         size_t start_index;
@@ -19,9 +22,6 @@ namespace ncnn
         CoreInfo* coreinfo;
         #endif
     };
-    struct ThreadWorkspace{
-        Layer* layer;
-    };
     class MutilThread
     {
     private:

From 79a768dbdd64804307a11d7bc487c6d0c5faea23 Mon Sep 17 00:00:00 2001
From: DC Technology <412200533@qq.com>
Date: Tue, 22 Jul 2025 02:17:47 +0800
Subject: [PATCH 04/11] fix

---
 src/thread.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/thread.h b/src/thread.h
index 788e86c05..bb97cb7d6 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -6,8 +6,6 @@
 #include <pthread.h>
 #endif
 namespace ncnn {
-struct ThreadInfoExc
-{
     struct ThreadWorkspace{
         Layer* layer;
     };

From 6d98cc39ce86118ee899171fca6f4349dc8f6e73 Mon Sep 17 00:00:00 2001
From: DaChengTechnology <12637177+DaChengTechnology@users.noreply.github.com>
Date: Mon, 21 Jul 2025 18:19:57 +0000
Subject: [PATCH 05/11] apply code-format changes

---
 src/thread.h | 59 +++++++++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 28 deletions(-)

diff --git a/src/thread.h b/src/thread.h
index bb97cb7d6..6e6fc30a2 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -6,33 +6,36 @@
 #include <pthread.h>
 #endif
 namespace ncnn {
-    struct ThreadWorkspace{
-        Layer* layer;
-    };
-    struct ThreadInfoExc{
-        int threadid;
-        size_t start_index;
-        size_t end_index;
-        ThreadWorkspace* workspace;
-        std::vector<ncnn::Mat>* mats;
-        Option* opt;
-        MutilThread* manager;
-        #if defined _WIN32
-        CoreInfo* coreinfo;
-        #endif
-    };
-    class MutilThread
-    {
-    private:
-        Option m_opt;
-        volatile int helpid;
-        ThreadWorkspace workspace;
-    public:
-        MutilThread(ThreadWorkspace _workspace,const Option& opt);
-        void join(std::vector<ncnn::Mat>& mats);
-        std::vector<bool> threadsComplete;
-        ~MutilThread();
-    };
-    
+struct ThreadWorkspace
+{
+    Layer* layer;
+};
+struct ThreadInfoExc
+{
+    int threadid;
+    size_t start_index;
+    size_t end_index;
+    ThreadWorkspace* workspace;
+    std::vector<ncnn::Mat>* mats;
+    Option* opt;
+    MutilThread* manager;
+#if defined _WIN32
+    CoreInfo* coreinfo;
+#endif
+};
+class MutilThread
+{
+private:
+    Option m_opt;
+    volatile int helpid;
+    ThreadWorkspace workspace;
+
+public:
+    MutilThread(ThreadWorkspace _workspace, const Option& opt);
+    void join(std::vector<ncnn::Mat>& mats);
+    std::vector<bool> threadsComplete;
+    ~MutilThread();
+};
+
 } // namespace ncnn
 #endif

From 1d2bf3ca5939c214f0d49e0738919bcbf066a87c Mon Sep 17 00:00:00 2001
From: DC Technology <412200533@qq.com>
Date: Tue, 22 Jul 2025 13:14:30 +0800
Subject: [PATCH 06/11] add forward declaration of MutilThread

---
 src/thread.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/thread.h b/src/thread.h
index bb97cb7d6..cf97ed6fe 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -6,6 +6,7 @@
 #include <pthread.h>
 #endif
 namespace ncnn {
+    class MutilThread;
     struct ThreadWorkspace{
         Layer* layer;
     };

From dd710680457f83399ee11762f634aa01b7de4fde Mon Sep 17 00:00:00 2001
From: DaChengTechnology <12637177+DaChengTechnology@users.noreply.github.com>
Date: Tue, 22 Jul 2025 05:29:44 +0000
Subject: [PATCH 07/11] apply code-format changes

---
 src/thread.h | 61 +++++++++++++++++++++++++++-------------------------
 1 file changed, 32 insertions(+), 29 deletions(-)

diff --git a/src/thread.h b/src/thread.h
index cf97ed6fe..9f12c71d4 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -6,34 +6,37 @@
 #include <pthread.h>
 #endif
 namespace ncnn {
-    class MutilThread;
-    struct ThreadWorkspace{
-        Layer* layer;
-    };
-    struct ThreadInfoExc{
-        int threadid;
-        size_t start_index;
-        size_t end_index;
-        ThreadWorkspace* workspace;
-        std::vector<ncnn::Mat>* mats;
-        Option* opt;
-        MutilThread* manager;
-        #if defined _WIN32
-        CoreInfo* coreinfo;
-        #endif
-    };
-    class MutilThread
-    {
-    private:
-        Option m_opt;
-        volatile int helpid;
-        ThreadWorkspace workspace;
-    public:
-        MutilThread(ThreadWorkspace _workspace,const Option& opt);
-        void join(std::vector<ncnn::Mat>& mats);
-        std::vector<bool> threadsComplete;
-        ~MutilThread();
-    };
-    
+class MutilThread;
+struct ThreadWorkspace
+{
+    Layer* layer;
+};
+struct ThreadInfoExc
+{
+    int threadid;
+    size_t start_index;
+    size_t end_index;
+    ThreadWorkspace* workspace;
+    std::vector<ncnn::Mat>* mats;
+    Option* opt;
+    MutilThread* manager;
+#if defined _WIN32
+    CoreInfo* coreinfo;
+#endif
+};
+class MutilThread
+{
+private:
+    Option m_opt;
+    volatile int helpid;
+    ThreadWorkspace workspace;
+
+public:
+    MutilThread(ThreadWorkspace _workspace, const Option& opt);
+    void join(std::vector<ncnn::Mat>& mats);
+    std::vector<bool> threadsComplete;
+    ~MutilThread();
+};
+
 } // namespace ncnn
 #endif

From a88e5549d2face1bfaebd2820ad7a7fac0d255f1 Mon Sep 17 00:00:00 2001
From: DC Technology <412200533@qq.com>
Date: Tue, 22 Jul 2025 16:10:05 +0800
Subject: [PATCH 08/11] support Android and Linux-like systems; iOS, macOS
 and HarmonyOS are not supported

---
 src/thread.cpp | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/thread.cpp b/src/thread.cpp
index 587c67666..1b7568f13 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -1,5 +1,9 @@
 #include "thread.h"
 #include "cpu.h"
+#if defined __ANDROID__ || defined __linux__
+#include <sched.h>
+#endif
+
 
 #if defined _WIN32
 DWORD WINAPI winWorker(LPVOID lpParam)
@@ -23,13 +27,14 @@ DWORD WINAPI winWorker(LPVOID lpParam)
 void* pthreadWorker(void* lpParam)
 {
     ncnn::ThreadInfoExc* info = (ncnn::ThreadInfoExc*)lpParam;
+    #if defined __ANDROID__ || defined __linux__
     cpu_set_t cpuset;
     CPU_ZERO(&cpuset);
     CPU_SET(info->threadid, &cpuset);
-
     // 绑定到指定核心
     pthread_t current_thread = pthread_self();
     pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);
+    #endif
     info->workspace->layer->forward_thread(info);
     info->manager->threadsComplete[info->threadid] = true;
     delete info;
@@ -115,6 +120,12 @@ void MutilThread::join(std::vector<Mat>& mats)
     }
     handles.clear();
 #else
+    Mat mat = mats[0];
+    int curid = -1;
+    #if defined __ANDROID__ || defined __linux__
+    curid = sched_getcpu();
+    #endif
+
     std::vector<pthread_t> pthread_handles;
     ThreadInfoExc* curinfo = nullptr;
     size_t workersize = ((mat.w * mat.h * mat.d) / m_opt.num_threads + 1) * mat.c * mat.elemsize;
@@ -134,7 +145,7 @@ void MutilThread::join(std::vector<Mat>& mats)
         info->opt = &m_opt;
         threadsComplete[i] = false;
         info->manager = this;
-        if (cur.id == cores[i].id)
+        if (curid == cores[i].id && curid > 1)
         {
             helpid = i;
             threadsComplete[i] = true;

From 8caa9c36f895a39134643d645231855665f5c316 Mon Sep 17 00:00:00 2001
From: DaChengTechnology <12637177+DaChengTechnology@users.noreply.github.com>
Date: Tue, 22 Jul 2025 08:20:25 +0000
Subject: [PATCH 09/11] apply code-format changes

---
 src/thread.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/thread.cpp b/src/thread.cpp
index 1b7568f13..62d60f13d 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -4,7 +4,6 @@
 #include <sched.h>
 #endif
 
-
 #if defined _WIN32
 DWORD WINAPI winWorker(LPVOID lpParam)
 {
@@ -27,14 +26,14 @@ DWORD WINAPI winWorker(LPVOID lpParam)
 void* pthreadWorker(void* lpParam)
 {
     ncnn::ThreadInfoExc* info = (ncnn::ThreadInfoExc*)lpParam;
-    #if defined __ANDROID__ || defined __linux__
+#if defined __ANDROID__ || defined __linux__
     cpu_set_t cpuset;
     CPU_ZERO(&cpuset);
     CPU_SET(info->threadid, &cpuset);
     // 绑定到指定核心
     pthread_t current_thread = pthread_self();
     pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);
-    #endif
+#endif
     info->workspace->layer->forward_thread(info);
     info->manager->threadsComplete[info->threadid] = true;
     delete info;
@@ -122,9 +121,9 @@ void MutilThread::join(std::vector<Mat>& mats)
 #else
     Mat mat = mats[0];
     int curid = -1;
-    #if defined __ANDROID__ || defined __linux__
+#if defined __ANDROID__ || defined __linux__
     curid = sched_getcpu();
-    #endif
+#endif
 
     std::vector<pthread_t> pthread_handles;
     ThreadInfoExc* curinfo = nullptr;

From e695869d7512c0ec325ce5fd8d51b2ba3ae56b03 Mon Sep 17 00:00:00 2001
From: DC Technology <412200533@qq.com>
Date: Tue, 29 Jul 2025 21:04:00 +0800
Subject: [PATCH 10/11] add test and fix build

---
 CMakeLists.txt        |   2 +-
 src/CMakeLists.txt    |   2 +-
 src/TheadInfo.h       |   4 +-
 src/cpu.cpp           |  20 ++++++--
 src/cpu.h             |   1 +
 src/layer.cpp         |   5 ++
 src/thread.cpp        |  12 ++---
 tests/CMakeLists.txt  |   1 +
 tests/test_thread.cpp | 112 ++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 146 insertions(+), 13 deletions(-)
 create mode 100644 tests/test_thread.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9a86c8ca5..a4ed26be1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -81,7 +81,7 @@ option(NCNN_SIMPLEVK "minimal in-house vulkan loader" ON)
 option(NCNN_SYSTEM_GLSLANG "use system glslang library" OFF)
 option(NCNN_RUNTIME_CPU "runtime dispatch cpu routines" ON)
 option(NCNN_DISABLE_PIC "disable position-independent code" OFF)
-option(NCNN_BUILD_TESTS "build tests" OFF)
+option(NCNN_BUILD_TESTS "build tests" ON)
 option(NCNN_COVERAGE "build for coverage" OFF)
 option(NCNN_ASAN "build for address sanitizer" OFF)
 option(NCNN_BUILD_BENCHMARK "build benchmark" ON)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 57f1fbf42..a704c0b55 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -51,7 +51,7 @@ endif()
 if(NCNN_MUTITHREAD)
     list(APPEND ncnn_SRCS thread.cpp)
     if(WIN32)
-        list(APPEND ncnn_SRCS ThreadInfo.cpp)
+        list(APPEND ncnn_SRCS TheadInfo.cpp)
     endif()
 endif()
 
diff --git a/src/TheadInfo.h b/src/TheadInfo.h
index 25f1b74c3..7ab03b697 100644
--- a/src/TheadInfo.h
+++ b/src/TheadInfo.h
@@ -11,12 +11,12 @@ public:
     int group;
     DWORD_PTR affinity;
 };
-class TheadInfo
+class ThreadInfo
 {
 private:
     static ThreadInfo* thread_info;
     std::vector<CoreInfo> core_infos;
-    TheadInfo(/* args */);
+    ThreadInfo(/* args */);
 
 public:
     static ThreadInfo* get();
diff --git a/src/cpu.cpp b/src/cpu.cpp
index 022981259..93680f46b 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -1424,12 +1424,21 @@ static std::vector<int> get_max_freq_mhz()
 
 static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
 {
+    #ifdef _WIN32
+        GROUP_AFFINITY groupAffinity;
+        ZeroMemory(&groupAffinity, sizeof(groupAffinity));
+        groupAffinity.Group = static_cast<WORD>(thread_affinity_mask.cpu_group);
+        groupAffinity.Mask = thread_affinity_mask.mask;
+
+        SetThreadGroupAffinity(GetCurrentThread(), &groupAffinity, NULL);
+    #else
     DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask);
     if (prev_mask == 0)
     {
         NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
         return -1;
     }
+    #endif
 
     return 0;
 }
@@ -2266,22 +2275,27 @@ CpuSet::CpuSet()
 
 void CpuSet::enable(int cpu)
 {
-    mask |= ((ULONG_PTR)1 << cpu);
+    cpu_group = cpu/64;
+    mask |= ((ULONG_PTR)1 << (cpu-cpu_group*64));
 }
 
 void CpuSet::disable(int cpu)
 {
-    mask &= ~((ULONG_PTR)1 << cpu);
+    cpu_group = cpu/64;
+    mask &= ~((ULONG_PTR)1 << (cpu-cpu_group*64));
 }
 
 void CpuSet::disable_all()
 {
+    cpu_group = 0;
     mask = 0;
 }
 
 bool CpuSet::is_enabled(int cpu) const
 {
-    return mask & ((ULONG_PTR)1 << cpu);
+    if (cpu_group != cpu/64)
+        return false;
+    return mask & ((ULONG_PTR)1 << (cpu-cpu_group*64));
 }
 
 int CpuSet::num_enabled() const
diff --git a/src/cpu.h b/src/cpu.h
index cf0f8e87e..0c8761a53 100644
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -31,6 +31,7 @@ public:
 
 public:
 #if defined _WIN32
+    int cpu_group;
     ULONG_PTR mask;
 #endif
 #if defined __ANDROID__ || defined __linux__
diff --git a/src/layer.cpp b/src/layer.cpp
index f1b849dad..4792c7231 100644
--- a/src/layer.cpp
+++ b/src/layer.cpp
@@ -98,6 +98,11 @@ int Layer::forward_inplace(Mat& /*bottom_top_blob*/, const Option& /*opt*/) cons
     return -1;
 }
 
+int Layer::forward_thread(void* /*info*/) const
+{
+    return -1;
+}
+
 #if NCNN_VULKAN
 int Layer::upload_model(VkTransfer& /*cmd*/, const Option& /*opt*/)
 {
diff --git a/src/thread.cpp b/src/thread.cpp
index 1b7568f13..fa382713f 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -16,7 +16,7 @@ DWORD WINAPI winWorker(LPVOID lpParam)
         groupAffinity.Group = static_cast<WORD>(info->coreinfo->group);
         groupAffinity.Mask = info->coreinfo->affinity;
 
-        return SetThreadGroupAffinity(GetCurrentThread(), &groupAffinity, NULL) != 0;
+        SetThreadGroupAffinity(GetCurrentThread(), &groupAffinity, NULL);
     }
     info->workspace->layer->forward_thread(info);
     info->manager->threadsComplete[info->threadid] = true;
@@ -63,9 +63,9 @@ void MutilThread::join(std::vector<Mat>& mats)
 {
 #if defined _WIN32
     Mat mat = mats[0];
-    CoreInfo cur = TheadInfo::get()->getCurrentCore();
+    CoreInfo cur = ThreadInfo::get()->getCurrentCore();
     std::vector<CoreInfo> cores;
-    TheadInfo::get()->getAllCore(cores);
+    ThreadInfo::get()->getAllCore(cores);   
     std::vector<HANDLE> handles;
     ThreadInfoExc* curinfo = nullptr;
     size_t workersize = ((mat.w * mat.h * mat.d) / m_opt.num_threads + 1) * mat.c * mat.elemsize;
@@ -96,7 +96,7 @@ void MutilThread::join(std::vector<Mat>& mats)
         }
         handles.push_back(CreateThread(nullptr, 0, winWorker, info, 0, nullptr));
     }
-    workspace.layer->forward_inplace(curinfo);
+    workspace.layer->forward_thread(curinfo);
     delete curinfo;
     bool check = true;
     do
@@ -145,7 +145,7 @@ void MutilThread::join(std::vector<Mat>& mats)
         info->opt = &m_opt;
         threadsComplete[i] = false;
         info->manager = this;
-        if (curid == cores[i].id && curid > 1)
+        if (curid == cores[i].id && curid > -1)
         {
             helpid = i;
             threadsComplete[i] = true;
@@ -154,7 +154,7 @@ void MutilThread::join(std::vector<Mat>& mats)
         }
         pthread_handles.push_back(pthread_create(&pthread_handles[i], nullptr, pthreadWorker, info));
     }
-    workspace.layer->forward_inplace(curinfo);
+    workspace.layer->forward_thread(curinfo);
     delete curinfo;
     for (size_t i = 0; i < pthread_handles.size(); i++)
     {
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 9d5b6517e..25c92367c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -62,6 +62,7 @@ ncnn_add_test(c_api)
 ncnn_add_test(cpu)
 ncnn_add_test(expression)
 ncnn_add_test(paramdict)
+ncnn_add_test(thread)
 
 if(NCNN_VULKAN)
     ncnn_add_test(command)
diff --git a/tests/test_thread.cpp b/tests/test_thread.cpp
new file mode 100644
index 000000000..9b96169df
--- /dev/null
+++ b/tests/test_thread.cpp
@@ -0,0 +1,112 @@
+#include "testutil.h"
+#include "thread.h"
+
+class TestLayer : public ncnn::Layer
+{
+public:
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt)
+    {
+        ThreadWorkspace workspace;
+        workspace.layer = (Layer*)this;
+        MutilThread thread(workspace, opt);
+        std::vector<Mat> workspace_blobs;
+        workspace_blobs.push_back(bottom_top_blob);
+        thread.join(workspace_blobs);
+        return 0;
+    }
+    virtual int forward_thread(void* workspace)
+    {
+        ThreadInfoExc* info = (ThreadInfoExc*)workspace;
+        Mat& bottom_top_blob = info->mats->at(0);
+        if (bottom_top_blob.elemsize == 1)
+        {
+            int8_t* ptr = (int8_t*)bottom_top_blob.data;
+            const int8_t flag = 1 << 7;
+            for (size_t i = info->start_index; i < info->end_index; i++)
+            {
+                if (ptr[i] & flag)
+                {
+                    ptr[i] = -ptr[i];
+                }
+            }
+        }
+        else if (bottom_top_blob.elemsize == 2)
+        {
+            int16_t* ptr = (int16_t*)bottom_top_blob.data;
+            const int16_t flag = 1 << 15;
+            for (size_t i = info->start_index; i < info->end_index; i++)
+            {
+                if (ptr[i] & flag)
+                {
+                    ptr[i] = -ptr[i];
+                }
+            }
+        }
+        else
+        {
+            float* ptr = (float*)bottom_top_blob.data;
+            for (size_t i = info->start_index; i < info->end_index; i++)
+            {
+                if (ptr[i] < 0)
+                {
+                    ptr[i] = -ptr[i];
+                }
+            }
+        }
+
+        return 0;
+    }
+};
+
+static int test_thread(const ncnn::Mat& a)
+{
+    ncnn::ParamDict pd;
+
+    std::vector<ncnn::Mat> weights(0);
+
+    int ret = test_layer("TestLayer", pd, weights, a);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_thread failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c);
+    }
+
+    return ret;
+}
+
+static int test_thread_0(){
+    return 0
+          || test_thread(RandomMat(5,6,7,24))
+          || test_thread(RandomMat(5,6,7,12))
+          || test_thread(RandomMat(5,6,7,13));
+
+}
+
+static int test_thread_1(){
+    return 0
+          || test_thread(RandomMat(5,7,24))
+          || test_thread(RandomMat(5,6,24))
+          || test_thread(RandomMat(7,9,24));
+}
+
+static int test_thread_2(){
+    return 0
+          || test_thread(RandomMat(7,12))
+          || test_thread(RandomMat(5,12))
+          || test_thread(RandomMat(9,12));
+}
+
+static int test_thread_3(){
+    return 0
+          || test_thread(RandomMat(7))
+          || test_thread(RandomMat(128))
+          || test_thread(RandomMat(256));
+}
+
+int main()
+{
+    return 0 
+           || test_thread_0()
+           || test_thread_1()
+           || test_thread_2()
+           || test_thread_3();
+}
\ No newline at end of file

From 75e90b47c451170902f6829a21ad4c7674f2b968 Mon Sep 17 00:00:00 2001
From: DaChengTechnology <12637177+DaChengTechnology@users.noreply.github.com>
Date: Tue, 29 Jul 2025 13:21:50 +0000
Subject: [PATCH 11/11] apply code-format changes

---
 src/cpu.cpp           | 30 +++++++++++++++---------------
 src/thread.cpp        |  2 +-
 tests/test_thread.cpp | 39 +++++++++++++++++++++------------------
 3 files changed, 37 insertions(+), 34 deletions(-)

diff --git a/src/cpu.cpp b/src/cpu.cpp
index 93680f46b..f623f3b1a 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -1424,21 +1424,21 @@ static std::vector<int> get_max_freq_mhz()
 
 static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
 {
-    #ifdef _WIN32
-        GROUP_AFFINITY groupAffinity;
-        ZeroMemory(&groupAffinity, sizeof(groupAffinity));
-        groupAffinity.Group = static_cast<WORD>(thread_affinity_mask.cpu_group);
-        groupAffinity.Mask = thread_affinity_mask.mask;
-
-        SetThreadGroupAffinity(GetCurrentThread(), &groupAffinity, NULL);
-    #else
+#ifdef _WIN32
+    GROUP_AFFINITY groupAffinity;
+    ZeroMemory(&groupAffinity, sizeof(groupAffinity));
+    groupAffinity.Group = static_cast<WORD>(thread_affinity_mask.cpu_group);
+    groupAffinity.Mask = thread_affinity_mask.mask;
+
+    SetThreadGroupAffinity(GetCurrentThread(), &groupAffinity, NULL);
+#else
     DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask);
     if (prev_mask == 0)
     {
         NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
         return -1;
     }
-    #endif
+#endif
 
     return 0;
 }
@@ -2275,14 +2275,14 @@ CpuSet::CpuSet()
 
 void CpuSet::enable(int cpu)
 {
-    cpu_group = cpu/64;
-    mask |= ((ULONG_PTR)1 << (cpu-cpu_group*64));
+    cpu_group = cpu / 64;
+    mask |= ((ULONG_PTR)1 << (cpu - cpu_group * 64));
 }
 
 void CpuSet::disable(int cpu)
 {
-    cpu_group = cpu/64;
-    mask &= ~((ULONG_PTR)1 << (cpu-cpu_group*64));
+    cpu_group = cpu / 64;
+    mask &= ~((ULONG_PTR)1 << (cpu - cpu_group * 64));
 }
 
 void CpuSet::disable_all()
@@ -2293,9 +2293,9 @@ void CpuSet::disable_all()
 
 bool CpuSet::is_enabled(int cpu) const
 {
-    if (cpu_group != cpu/64)
+    if (cpu_group != cpu / 64)
         return false;
-    return mask & ((ULONG_PTR)1 << (cpu-cpu_group*64));
+    return mask & ((ULONG_PTR)1 << (cpu - cpu_group * 64));
 }
 
 int CpuSet::num_enabled() const
diff --git a/src/thread.cpp b/src/thread.cpp
index e3df4ef64..779f46c87 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -64,7 +64,7 @@ void MutilThread::join(std::vector<Mat>& mats)
     Mat mat = mats[0];
     CoreInfo cur = ThreadInfo::get()->getCurrentCore();
     std::vector<CoreInfo> cores;
-    ThreadInfo::get()->getAllCore(cores);   
+    ThreadInfo::get()->getAllCore(cores);
     std::vector<HANDLE> handles;
     ThreadInfoExc* curinfo = nullptr;
     size_t workersize = ((mat.w * mat.h * mat.d) / m_opt.num_threads + 1) * mat.c * mat.elemsize;
diff --git a/tests/test_thread.cpp b/tests/test_thread.cpp
index 9b96169df..883d1e56b 100644
--- a/tests/test_thread.cpp
+++ b/tests/test_thread.cpp
@@ -73,38 +73,41 @@ static int test_thread(const ncnn::Mat& a)
     return ret;
 }
 
-static int test_thread_0(){
+static int test_thread_0()
+{
     return 0
-          || test_thread(RandomMat(5,6,7,24))
-          || test_thread(RandomMat(5,6,7,12))
-          || test_thread(RandomMat(5,6,7,13));
-
+           || test_thread(RandomMat(5, 6, 7, 24))
+           || test_thread(RandomMat(5, 6, 7, 12))
+           || test_thread(RandomMat(5, 6, 7, 13));
 }
 
-static int test_thread_1(){
+static int test_thread_1()
+{
     return 0
-          || test_thread(RandomMat(5,7,24))
-          || test_thread(RandomMat(5,6,24))
-          || test_thread(RandomMat(7,9,24));
+           || test_thread(RandomMat(5, 7, 24))
+           || test_thread(RandomMat(5, 6, 24))
+           || test_thread(RandomMat(7, 9, 24));
 }
 
-static int test_thread_2(){
+static int test_thread_2()
+{
     return 0
-          || test_thread(RandomMat(7,12))
-          || test_thread(RandomMat(5,12))
-          || test_thread(RandomMat(9,12));
+           || test_thread(RandomMat(7, 12))
+           || test_thread(RandomMat(5, 12))
+           || test_thread(RandomMat(9, 12));
 }
 
-static int test_thread_3(){
+static int test_thread_3()
+{
     return 0
-          || test_thread(RandomMat(7))
-          || test_thread(RandomMat(128))
-          || test_thread(RandomMat(256));
+           || test_thread(RandomMat(7))
+           || test_thread(RandomMat(128))
+           || test_thread(RandomMat(256));
 }
 
 int main()
 {
-    return 0 
+    return 0
            || test_thread_0()
            || test_thread_1()
            || test_thread_2()