// Tencent is pleased to support the open source community by making ncnn available. // // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at // // https://opensource.org/licenses/BSD-3-Clause // // Unless required by applicable law or agreed to in writing, software distributed // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. #ifndef NCNN_ALLOCATOR_H #define NCNN_ALLOCATOR_H #ifdef _WIN32 #define WIN32_LEAN_AND_MEAN #include #else #include #endif #include #include #include #include "platform.h" #if NCNN_VULKAN #include #include "gpu.h" #endif // NCNN_VULKAN namespace ncnn { // the alignment of all the allocated buffers #define MALLOC_ALIGN 16 // Aligns a pointer to the specified number of bytes // ptr Aligned pointer // n Alignment size that must be a power of two template static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp)) { return (_Tp*)(((size_t)ptr + n-1) & -n); } // Aligns a buffer size to the specified number of bytes // The function returns the minimum number that is greater or equal to sz and is divisible by n // sz Buffer size to align // n Alignment size that must be a power of two static inline size_t alignSize(size_t sz, int n) { return (sz + n-1) & -n; } static inline void* fastMalloc(size_t size) { #if _MSC_VER return _aligned_malloc(size, MALLOC_ALIGN); #elif _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17) void* ptr = 0; if (posix_memalign(&ptr, MALLOC_ALIGN, size)) ptr = 0; return ptr; #elif __ANDROID__ && __ANDROID_API__ < 17 return memalign(MALLOC_ALIGN, size); #else unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + MALLOC_ALIGN); if (!udata) return 0; unsigned char** adata = alignPtr((unsigned char**)udata + 1, MALLOC_ALIGN); adata[-1] = udata; return adata; #endif } static inline void fastFree(void* ptr) { if (ptr) { #if _MSC_VER _aligned_free(ptr); #elif _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17) free(ptr); #elif __ANDROID__ && __ANDROID_API__ < 17 free(ptr); #else unsigned char* udata = ((unsigned char**)ptr)[-1]; free(udata); #endif } } // exchange-add operation for atomic operations on reference counters #if defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32) // atomic increment on the linux version of the Intel(tm) compiler # define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast(reinterpret_cast(addr)), delta) #elif defined __GNUC__ # if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__) # ifdef __ATOMIC_ACQ_REL # define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL) # else # define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4) # endif # else # if defined __ATOMIC_ACQ_REL && !defined __clang__ // version for gcc >= 4.7 # define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL) # else # define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta)) # endif # endif #elif defined _MSC_VER && !defined RC_INVOKED # include # define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta) #else // thread-unsafe branch static inline int NCNN_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; } #endif #ifdef _WIN32 class Mutex { public: Mutex() { InitializeSRWLock(&srwlock); } ~Mutex() {} void lock() { AcquireSRWLockExclusive(&srwlock); } void unlock() { ReleaseSRWLockExclusive(&srwlock); } private: // NOTE SRWLock is available from windows vista SRWLOCK srwlock; }; #else // _WIN32 class Mutex { public: Mutex() { pthread_mutex_init(&mutex, 0); } ~Mutex() { pthread_mutex_destroy(&mutex); } void lock() { pthread_mutex_lock(&mutex); } void unlock() { pthread_mutex_unlock(&mutex); } private: pthread_mutex_t mutex; }; #endif // _WIN32 class MutexLockGuard { public: MutexLockGuard(Mutex& _mutex) : mutex(_mutex) { mutex.lock(); } ~MutexLockGuard() { mutex.unlock(); } private: Mutex& mutex; }; class Allocator { public: virtual ~Allocator() = 0; virtual void* fastMalloc(size_t size) = 0; virtual void fastFree(void* ptr) = 0; }; class PoolAllocator : public Allocator { public: PoolAllocator(); ~PoolAllocator(); // ratio range 0 ~ 1 // default cr = 0.75 void set_size_compare_ratio(float scr); // release all budgets immediately void clear(); virtual void* fastMalloc(size_t size); virtual void fastFree(void* ptr); private: Mutex budgets_lock; Mutex payouts_lock; unsigned int size_compare_ratio;// 0~256 std::list< std::pair > budgets; std::list< std::pair > payouts; }; class UnlockedPoolAllocator : public Allocator { public: UnlockedPoolAllocator(); ~UnlockedPoolAllocator(); // ratio range 0 ~ 1 // default cr = 0.75 void set_size_compare_ratio(float scr); // release all budgets immediately void clear(); virtual void* fastMalloc(size_t size); virtual void fastFree(void* ptr); private: unsigned int size_compare_ratio;// 0~256 std::list< std::pair > budgets; std::list< std::pair > payouts; }; #if NCNN_VULKAN class VkBufferMemory { public: VkBuffer buffer; // the base offset assigned by allocator size_t offset; size_t capacity; VkDeviceMemory memory; void* mapped_ptr; // buffer state, modified by command functions internally // 0=null // 1=created // 2=transfer // 3=compute // 4=readonly mutable int state; // initialize and modified by mat int refcount; }; class VkAllocator { public: VkAllocator(const VulkanDevice* _vkdev); virtual ~VkAllocator() { clear(); } virtual void clear() {} virtual VkBufferMemory* fastMalloc(size_t size) = 0; virtual void fastFree(VkBufferMemory* ptr) = 0; public: const VulkanDevice* vkdev; bool mappable; protected: VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage); VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index); VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkBuffer buffer); }; class VkUnlockedBlobBufferAllocator : public VkAllocator { public: VkUnlockedBlobBufferAllocator(const VulkanDevice* vkdev); virtual ~VkUnlockedBlobBufferAllocator(); public: // buffer block size, default=16M void set_block_size(size_t size); // release all budgets immediately virtual void clear(); virtual VkBufferMemory* fastMalloc(size_t size); virtual void fastFree(VkBufferMemory* ptr); private: size_t block_size; size_t buffer_offset_alignment; std::vector< std::list< std::pair > > budgets; std::vector buffer_blocks; }; class VkBlobBufferAllocator : public VkUnlockedBlobBufferAllocator { public: VkBlobBufferAllocator(const VulkanDevice* vkdev); virtual ~VkBlobBufferAllocator(); public: virtual void clear(); virtual VkBufferMemory* fastMalloc(size_t size); virtual void fastFree(VkBufferMemory* ptr); private: Mutex budgets_lock; }; class VkWeightBufferAllocator : public VkAllocator { public: VkWeightBufferAllocator(const VulkanDevice* vkdev); virtual ~VkWeightBufferAllocator(); public: // buffer block size, default=8M void set_block_size(size_t block_size); // release all blocks immediately virtual void clear(); public: virtual VkBufferMemory* fastMalloc(size_t size); virtual void fastFree(VkBufferMemory* ptr); private: size_t block_size; size_t buffer_offset_alignment; std::vector buffer_block_free_spaces; std::vector buffer_blocks; std::vector dedicated_buffer_blocks; }; class VkUnlockedStagingBufferAllocator : public VkAllocator { public: VkUnlockedStagingBufferAllocator(const VulkanDevice* vkdev); virtual ~VkUnlockedStagingBufferAllocator(); public: // ratio range 0 ~ 1 // default cr = 0.75 void set_size_compare_ratio(float scr); // release all budgets immediately virtual void clear(); virtual VkBufferMemory* fastMalloc(size_t size); virtual void fastFree(VkBufferMemory* ptr); private: uint32_t memory_type_index; unsigned int size_compare_ratio;// 0~256 std::list budgets; }; class VkStagingBufferAllocator : public VkUnlockedStagingBufferAllocator { public: VkStagingBufferAllocator(const VulkanDevice* vkdev); virtual ~VkStagingBufferAllocator(); public: virtual void clear(); virtual VkBufferMemory* fastMalloc(size_t size); virtual void fastFree(VkBufferMemory* ptr); private: Mutex budgets_lock; }; class VkWeightStagingBufferAllocator : public VkAllocator { public: VkWeightStagingBufferAllocator(const VulkanDevice* vkdev); virtual ~VkWeightStagingBufferAllocator(); public: virtual VkBufferMemory* fastMalloc(size_t size); virtual void fastFree(VkBufferMemory* ptr); private: uint32_t memory_type_index; }; #endif // NCNN_VULKAN } // namespace ncnn #endif // NCNN_ALLOCATOR_H