Browse Source

vkmat and command api breaks (#1689)

* vkmat and command api breaks

* always use compute queue for compute buffer transfer

* no barrier for readonly weight buffer

* record clone, drop queue_owner

* bring back layer forward

* fix validation errors

* lifecycle inside command makes life easier

* update doc

* record_import_android_hardware_buffer
tags/20200413
nihui GitHub 6 years ago
parent
commit
7365bb80a2
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
40 changed files with 1771 additions and 1799 deletions
  1. +24
    -47
      docs/developer-guide/low-level-operation-api.md
  2. +18
    -9
      src/allocator.cpp
  3. +5
    -13
      src/allocator.h
  4. +1387
    -1120
      src/command.cpp
  5. +78
    -152
      src/command.h
  6. +2
    -0
      src/gpu.cpp
  7. +3
    -0
      src/gpu.h
  8. +2
    -10
      src/layer.cpp
  9. +4
    -4
      src/layer/vulkan/binaryop_vulkan.cpp
  10. +3
    -3
      src/layer/vulkan/cast_vulkan.cpp
  11. +9
    -9
      src/layer/vulkan/concat_vulkan.cpp
  12. +13
    -19
      src/layer/vulkan/convolution_vulkan.cpp
  13. +4
    -6
      src/layer/vulkan/convolutiondepthwise_vulkan.cpp
  14. +2
    -2
      src/layer/vulkan/crop_vulkan.cpp
  15. +3
    -4
      src/layer/vulkan/deconvolution_vulkan.cpp
  16. +5
    -7
      src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
  17. +1
    -1
      src/layer/vulkan/deepcopy_vulkan.cpp
  18. +1
    -1
      src/layer/vulkan/eltwise_vulkan.cpp
  19. +1
    -1
      src/layer/vulkan/flatten_vulkan.cpp
  20. +1
    -1
      src/layer/vulkan/innerproduct_vulkan.cpp
  21. +7
    -7
      src/layer/vulkan/instancenorm_vulkan.cpp
  22. +5
    -5
      src/layer/vulkan/interp_vulkan.cpp
  23. +2
    -2
      src/layer/vulkan/lrn_vulkan.cpp
  24. +3
    -3
      src/layer/vulkan/normalize_vulkan.cpp
  25. +3
    -3
      src/layer/vulkan/packing_vulkan.cpp
  26. +2
    -2
      src/layer/vulkan/padding_vulkan.cpp
  27. +2
    -2
      src/layer/vulkan/permute_vulkan.cpp
  28. +1
    -1
      src/layer/vulkan/pixelshuffle_vulkan.cpp
  29. +6
    -11
      src/layer/vulkan/pooling_vulkan.cpp
  30. +2
    -2
      src/layer/vulkan/priorbox_vulkan.cpp
  31. +1
    -1
      src/layer/vulkan/reorg_vulkan.cpp
  32. +3
    -3
      src/layer/vulkan/reshape_vulkan.cpp
  33. +1
    -1
      src/layer/vulkan/shufflechannel_vulkan.cpp
  34. +6
    -6
      src/layer/vulkan/slice_vulkan.cpp
  35. +12
    -12
      src/layer/vulkan/softmax_vulkan.cpp
  36. +82
    -176
      src/mat.h
  37. +23
    -64
      src/net.cpp
  38. +1
    -1
      src/pipeline.cpp
  39. +16
    -34
      tests/test_cast.cpp
  40. +27
    -54
      tests/testutil.h

+ 24
- 47
docs/developer-guide/low-level-operation-api.md View File

@@ -150,6 +150,17 @@ ncnn::VkWeightStagingBufferAllocator g_weight_staging_vkallocator(vkdev);
ncnn::Layer* convolution = ncnn::create_layer("Convolution");
convolution->vkdev = vkdev;

// set option
ncnn::Option opt;
opt.lightmode = true;
opt.num_threads = 4;
opt.blob_allocator = 0;
opt.workspace_allocator = 0;
opt.vulkan_compute = true;
opt.blob_vkallocator = &g_blob_vkallocator;
opt.workspace_vkallocator = &g_blob_vkallocator;
opt.staging_vkallocator = &g_staging_vkallocator;

// load param
{
ncnn::ParamDict pd;
@@ -171,76 +182,42 @@ ncnn::ModelBinFromMatArray mb(weights);
convolution->load_model(mb);
}

// upload model
{
ncnn::VkTransfer cmd(vkdev);
cmd.weight_vkallocator = &g_weight_vkallocator;
cmd.staging_vkallocator = &g_weight_staging_vkallocator;

convolution->upload_model(cmd);

cmd.submit();
cmd.wait();

g_weight_staging_vkallocator.clear();
}

// create pipeline
convolution->create_pipeline(opt);

// set default option
// upload model
{
ncnn::Option opt = ncnn::get_default_option();
ncnn::VkTransfer cmd(vkdev);

opt.lightmode = true;
opt.num_threads = 4;
opt.blob_allocator = 0;
opt.workspace_allocator = 0;
ncnn::Option opt_upload = opt;
opt_upload.blob_vkallocator = &g_weight_vkallocator;
opt_upload.workspace_vkallocator = &g_weight_vkallocator;
opt_upload.staging_vkallocator = &g_weight_staging_vkallocator;

opt.vulkan_compute = true;
opt.blob_vkallocator = &g_blob_vkallocator;
opt.workspace_vkallocator = &g_blob_vkallocator;
opt.staging_vkallocator = &g_staging_vkallocator;
convolution->upload_model(cmd, opt_upload);

ncnn::set_default_option(opt);
cmd.submit_and_wait();
}

ncnn::Mat bottom = random_mat(w, h, inch);

ncnn::VkMat bottom_gpu;

// copy bottom to bottom_gpu
{
bottom_gpu.create_like(bottom, &g_blob_vkallocator, &g_staging_vkallocator);
bottom_gpu.prepare_staging_buffer();
bottom_gpu.upload(bottom);
}

ncnn::VkMat top_gpu;
ncnn::Mat top;

// forward
{
ncnn::VkCompute cmd(vkdev);

cmd.record_upload(bottom_gpu);
ncnn::VkMat bottom_gpu;
cmd.record_upload(bottom, bottom_gpu, opt);

ncnn::VkMat top_gpu;
convolution->forward(bottom_gpu, top_gpu, cmd, opt);

top_gpu.prepare_staging_buffer();

cmd.record_download(top_gpu);
cmd.record_download(top_gpu, top, opt);

cmd.submit_and_wait();
}

ncnn::Mat top;

// copy top_gpu to top
{
top.create_like(top_gpu);
top_gpu.download(top);
}

convolution->destroy_pipeline(opt);

delete convolution;


+ 18
- 9
src/allocator.cpp View File

@@ -470,7 +470,8 @@ VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size)
ptr->memory = buffer_blocks[i]->memory;
ptr->capacity = aligned_size;
ptr->mapped_ptr = buffer_blocks[i]->mapped_ptr;
ptr->state = 1;
ptr->access_flags = 0;
ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

// adjust budgets
if (budget_size == aligned_size)
@@ -540,7 +541,8 @@ VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size)
ptr->memory = block->memory;
ptr->capacity = aligned_size;
ptr->mapped_ptr = block->mapped_ptr;
ptr->state = 1;
ptr->access_flags = 0;
ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

// adjust budgets
std::list< std::pair<size_t, size_t> > budget;
@@ -715,7 +717,8 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size)
ptr->memory = buffer_blocks[block_index]->memory;
ptr->capacity = aligned_size;
ptr->mapped_ptr = buffer_blocks[block_index]->mapped_ptr;
ptr->state = 1;
ptr->access_flags = 0;
ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

buffer_block_free_spaces[block_index] -= aligned_size;

@@ -790,7 +793,8 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size)
ptr->memory = block->memory;
ptr->capacity = new_block_size;
ptr->mapped_ptr = block->mapped_ptr;
ptr->state = 1;
ptr->access_flags = 0;
ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

return ptr;
}
@@ -841,7 +845,8 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size)
ptr->memory = block->memory;
ptr->capacity = aligned_size;
ptr->mapped_ptr = block->mapped_ptr;
ptr->state = 1;
ptr->access_flags = 0;
ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

return ptr;
}
@@ -940,7 +945,8 @@ VkBufferMemory* VkStagingBufferAllocator::fastMalloc(size_t size)

vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr);

ptr->state = 1;
ptr->access_flags = 0;
ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

// fprintf(stderr, "VkStagingBufferAllocator M %p %lu\n", ptr->buffer, size);

@@ -989,7 +995,8 @@ VkBufferMemory* VkWeightStagingBufferAllocator::fastMalloc(size_t size)

vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr);

ptr->state = 1;
ptr->access_flags = 0;
ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

// fprintf(stderr, "VkWeightStagingBufferAllocator M %p %lu\n", ptr->buffer, size);

@@ -1137,7 +1144,8 @@ VkImageMemory* VkSimpleImageAllocator::fastMalloc(int width, int height, VkForma

ptr->imageview = create_imageview(ptr->image, format);

ptr->state = 1;
ptr->access_flags = 0;
ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

return ptr;
}
@@ -1290,7 +1298,8 @@ VkImageMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(int /*width*/,
ptr->image = image;
ptr->memory = memory;
ptr->imageview = imageview;
ptr->state = 1;
ptr->access_flags = 0;
ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

return ptr;
}


+ 5
- 13
src/allocator.h View File

@@ -192,12 +192,8 @@ public:
void* mapped_ptr;

// buffer state, modified by command functions internally
// 0=null
// 1=created
// 2=transfer
// 3=compute
// 4=readonly
mutable int state;
mutable VkAccessFlags access_flags;
mutable VkPipelineStageFlags stage_flags;

// initialized and modified by mat
int refcount;
@@ -311,13 +307,9 @@ public:

VkDeviceMemory memory;

// buffer state, modified by command functions internally
// 0=null
// 1=created
// 2=transfer
// 3=compute
// 4=readonly
mutable int state;
// image state, modified by command functions internally
mutable VkAccessFlags access_flags;
mutable VkPipelineStageFlags stage_flags;

// initialized and modified by mat
int refcount;


+ 1387
- 1120
src/command.cpp
File diff suppressed because it is too large
View File


+ 78
- 152
src/command.h View File

@@ -1,6 +1,6 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
@@ -22,61 +22,31 @@
#include <vector>
#include <vulkan/vulkan.h>
#include "mat.h"
#include "pipeline.h"

namespace ncnn {

// Base class that bundles the Vulkan objects needed to record and submit
// work on a single queue family: one command pool, one command buffer and
// one fence, together with the device they belong to.
class Command
{
public:
// vkdev:              device the command objects are associated with.
// queue_family_index: queue family this command targets.
// NOTE(review): whether the constructor itself creates the pool/buffer/fence
// is not visible from this header - confirm in command.cpp.
Command(const VulkanDevice* vkdev, uint32_t queue_family_index);
// Virtual so that derived command types can be destroyed through a Command pointer.
virtual ~Command();

protected:
// Setup helpers for the command_pool / command_buffer members below.
// NOTE(review): the int return is presumably a status code (0 on success) - confirm.
int create_command_pool();
int create_command_buffer();

// record issue
// Helpers for starting/finishing recording on command_buffer and for
// submitting it and waiting on the fence (as the names suggest; the
// definitions live outside this header).
int begin_command_buffer();
int end_command_buffer();
int queue_submit_and_wait_fence();

protected:
// Device and queue family passed to the constructor.
const VulkanDevice* vkdev;
uint32_t queue_family_index;

// Vulkan command pool and the command buffer used for recording.
VkCommandPool command_pool;
VkCommandBuffer command_buffer;

// Fence associated with submissions of command_buffer.
VkFence fence;
};

class VkCompute : public Command
class Pipeline;
class VkCompute
{
public:
VkCompute(const VulkanDevice* vkdev);
~VkCompute();

void record_upload(const VkMat& m);

void record_download(const VkMat& m);
virtual ~VkCompute();

void record_clone(const VkMat& src, const VkMat& dst);
public:
void record_upload(const Mat& src, VkMat& dst, const Option& opt);

void record_copy_region(const VkMat& src, const VkMat& dst, const VkBufferCopy& region);
void record_download(const VkMat& src, Mat& dst, const Option& opt);

void record_copy_regions(const VkMat& src, const VkMat& dst, const std::vector<VkBufferCopy>& regions);
void record_clone(const VkMat& src, VkMat& dst, const Option& opt);

void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& m);
void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);

#if NCNN_BENCHMARK
void record_write_timestamp(uint32_t query);
#endif // NCNN_BENCHMARK

void record_queue_transfer_acquire(const VkMat& m, uint32_t src_queue_family_index);

#if __ANDROID_API__ >= 26
void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& im, const VkMat& m);
void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst);
#endif // __ANDROID_API__ >= 26

int submit_and_wait();
@@ -90,116 +60,73 @@ public:
#endif // NCNN_BENCHMARK

protected:
// record pipeline things
void record_bind_pipeline(VkPipeline pipeline);
void record_update_bindings(VkPipelineLayout pipeline_layout, VkDescriptorSetLayout descriptorset_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const std::vector<VkMat>& bindings);
void record_push_constants(VkPipelineLayout pipeline_layout, const std::vector<vk_constant_type>& constants);
void record_dispatch(const uint32_t* group_count_xyz);

// record barrier things
void record_transfer_compute_barrier(const VkMat& m);
void record_compute_transfer_barrier(const VkMat& m);
void record_compute_compute_barrier(const VkMat& m);
void record_transfer_transfer_barrier(const VkMat& m);
void record_host_transfer_barrier(const VkMat& m);
void record_transfer_host_barrier(const VkMat& m);
void record_host_compute_barrier(const VkMat& m);
void record_compute_host_barrier(const VkMat& m);

// record prepare things
void record_prepare_transfer_barrier(const VkMat& m);
void record_prepare_compute_barrier(const VkMat& m);
void record_prepare_host_barrier(const VkMat& m);

void record_initial_image_compute_barrier(const VkImageMat& im);
int init();
int begin_command_buffer();
int end_command_buffer();

#if __ANDROID_API__ >= 26
void record_update_import_android_hardware_buffer_bindings(VkPipelineLayout pipeline_layout, VkDescriptorSetLayout descriptorset_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, VkSampler sampler, const VkImageMat& im, const VkMat& m);
#endif // __ANDROID_API__ >= 26
protected:
const VulkanDevice* vkdev;

#if NCNN_BENCHMARK
void reset_query_pool();
#endif // NCNN_BENCHMARK
VkCommandPool compute_command_pool;

protected:
// recording issue
void copy_buffer(VkBuffer src, size_t src_offset, VkBuffer dst, size_t dst_offset, size_t size);
void copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vector<VkBufferCopy>& regions);
void bind_pipeline(VkPipeline pipeline);
void bind_descriptorset(VkPipelineLayout pipeline_layout, VkDescriptorSet descriptorset);
void update_bindings(VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const std::vector<VkDescriptorBufferInfo>& descriptorBufferInfos);
void push_constants(VkPipelineLayout pipeline_layout, const std::vector<vk_constant_type>& constants);
void dispatch(const uint32_t* group_count_xyz);
void transfer_compute_barrier(VkBuffer buffer, size_t offset, size_t size);
void compute_transfer_barrier(VkBuffer buffer, size_t offset, size_t size);
void compute_compute_barrier(VkBuffer buffer, size_t offset, size_t size);
void transfer_transfer_barrier(VkBuffer buffer, size_t offset, size_t size);
void host_transfer_barrier(VkBuffer buffer, size_t offset, size_t size);
void transfer_host_barrier(VkBuffer buffer, size_t offset, size_t size);
void host_compute_barrier(VkBuffer buffer, size_t offset, size_t size);
void compute_host_barrier(VkBuffer buffer, size_t offset, size_t size);
void queue_transfer_acquire_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t src_queue_family_index);
void initial_image_compute_barrier(VkImage image);
#if __ANDROID_API__ >= 26
void update_import_android_hardware_buffer_bindings(VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const VkDescriptorImageInfo& descriptorImageInfo, const VkDescriptorBufferInfo& descriptorBufferInfo);
#endif // __ANDROID_API__ >= 26
#if NCNN_BENCHMARK
void write_timestamp(uint32_t query);
#endif // NCNN_BENCHMARK
VkCommandBuffer compute_command_buffer;

VkFence compute_command_fence;

std::vector<VkMat> upload_staging_buffers;
std::vector<VkMat> download_post_buffers;
std::vector<Mat> download_post_mats;

protected:
// delayed record
// the good-old path for device without VK_KHR_push_descriptor
std::vector<VkDescriptorPool> descriptor_pools;
std::vector<VkDescriptorSet> descriptorsets;
struct record_type

struct record
{
// 0=copy
// 1=copy regions
// 2=bind pipeline
// 3=bind descriptorset
// 4=push constants
// 5=dispatch
// 6=transfer-compute barrier
// 7=compute-transfer barrier
// 8=compute-compute barrier
// 9=transfer-transfer barrier
// 10=write timestamp
// 11=initial image compute barrier
// 12=host-transfer barrier
// 13=transfer-host barrier
// 14=host-compute barrier
// 15=compute-host barrier
// 16=queue-transfer-acquire barrier
enum
{
TYPE_copy_buffer,
TYPE_bind_pipeline,
TYPE_bind_descriptorsets,
TYPE_push_constants,
TYPE_dispatch,
TYPE_memory_barrers,
TYPE_buffer_barrers,
TYPE_image_barrers,

#if NCNN_BENCHMARK
TYPE_write_timestamp,
#endif // NCNN_BENCHMARK

TYPE_post_download,
};

int type;
VkCommandBuffer command_buffer;

union
{
struct { VkBuffer src; size_t src_offset; VkBuffer dst; size_t dst_offset; size_t size; } copy;
struct { VkBuffer src; VkBuffer dst; } copy_regions;
struct { VkPipeline pipeline; } bind_pipeline;
struct { VkPipelineLayout pipeline_layout; VkDescriptorSet descriptorset; } bind_descriptorset;
struct { VkPipelineLayout pipeline_layout; } push_constants;
struct { uint32_t group_count_xyz[3]; } dispatch;
struct { VkBuffer buffer; size_t offset; size_t size; } transfer_compute_barrier;
struct { VkBuffer buffer; size_t offset; size_t size; } compute_transfer_barrier;
struct { VkBuffer buffer; size_t offset; size_t size; } compute_compute_barrier;
struct { VkBuffer buffer; size_t offset; size_t size; } transfer_transfer_barrier;
struct { VkBuffer src; VkBuffer dst; uint32_t region_count; const VkBufferCopy* regions; } copy_buffer;

struct { VkPipelineBindPoint bind_point; VkPipeline pipeline; } bind_pipeline;
struct { VkPipelineBindPoint bind_point; VkPipelineLayout pipeline_layout; uint32_t descriptorset_count; uint32_t descriptorset_offset; } bind_descriptorsets;
struct { VkPipelineLayout pipeline_layout; VkShaderStageFlags stage_flags; uint32_t size; const void* values; } push_constants;

struct { uint32_t group_count_x; uint32_t group_count_y; uint32_t group_count_z; } dispatch;

struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkMemoryBarrier* barriers; } memory_barrers;
struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkBufferMemoryBarrier* barriers; } buffer_barrers;
struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkImageMemoryBarrier* barriers; } image_barrers;

#if NCNN_BENCHMARK
struct { uint32_t query; } write_timestamp;
#endif // NCNN_BENCHMARK
struct { VkImage image; } initial_image_compute_barrier;
struct { VkBuffer buffer; size_t offset; size_t size; } host_transfer_barrier;
struct { VkBuffer buffer; size_t offset; size_t size; } transfer_host_barrier;
struct { VkBuffer buffer; size_t offset; size_t size; } host_compute_barrier;
struct { VkBuffer buffer; size_t offset; size_t size; } compute_host_barrier;
struct { VkBuffer buffer; size_t offset; size_t size; size_t src_queue_family_index; } queue_transfer_acquire_barrier;
};

std::vector<VkBufferCopy> regions;
std::vector<vk_constant_type> constants;
struct { uint32_t download_post_buffer_mat_offset; } post_download;
};
};
std::vector<record_type> delayed_records;

std::vector<record> delayed_records;

#if NCNN_BENCHMARK
uint32_t query_count;
@@ -207,38 +134,37 @@ protected:
#endif // NCNN_BENCHMARK
};

class VkTransfer : public Command
class VkTransfer
{
public:
VkTransfer(const VulkanDevice* vkdev);
~VkTransfer();

public:
void record_upload(const Mat& src, VkMat& dst, const Option& opt);

int submit_and_wait();

public:
VkAllocator* weight_vkallocator;
VkAllocator* staging_vkallocator;

protected:
// recording issue
void copy_buffer(VkBuffer src, size_t src_offset, VkBuffer dst, size_t dst_offset, size_t size);
void copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vector<VkBufferCopy>& regions);
void queue_transfer_release_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t target_queue_family_index);
int init();
int begin_command_buffer();
int end_command_buffer();

protected:
size_t buffer_offset_alignment;
VkBufferMemory* staging_data;
const VulkanDevice* vkdev;

// delayed record
struct record_type
{
size_t size;
Mat mat;
VkMat vkmat;
};
std::vector<record_type> delayed_records;
VkCommandPool compute_command_pool;
VkCommandPool transfer_command_pool;

VkCommandBuffer upload_command_buffer;
VkCommandBuffer compute_command_buffer;

VkSemaphore upload_compute_semaphore;

VkFence upload_command_fence;
VkFence compute_command_fence;

std::vector<VkMat> upload_staging_buffers;
};

} // namespace ncnn


+ 2
- 0
src/gpu.cpp View File

@@ -628,6 +628,8 @@ int create_gpu_instance()
gpu_info.graphics_queue_count = queueFamilyProperties[gpu_info.graphics_queue_family_index].queueCount;
gpu_info.transfer_queue_count = queueFamilyProperties[gpu_info.transfer_queue_family_index].queueCount;

gpu_info.unified_compute_transfer_queue = gpu_info.compute_queue_family_index == gpu_info.transfer_queue_family_index;

// cache memory properties
vkGetPhysicalDeviceMemoryProperties(physicalDevice, &gpu_info.physicalDeviceMemoryProperties);



+ 3
- 0
src/gpu.h View File

@@ -111,6 +111,9 @@ public:
uint32_t graphics_queue_count;
uint32_t transfer_queue_count;

// property
bool unified_compute_transfer_queue;

// bug is not feature
bool bug_local_size_spec_const;



+ 2
- 10
src/layer.cpp View File

@@ -121,11 +121,7 @@ int Layer::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& t
top_blobs.resize(bottom_blobs.size());
for (int i = 0; i < (int)top_blobs.size(); i++)
{
top_blobs[i].create_like(bottom_blobs[i], bottom_blobs[i].allocator, bottom_blobs[i].staging_allocator);
if (top_blobs[i].empty())
return -100;

cmd.record_clone(bottom_blobs[i], top_blobs[i]);
cmd.record_clone(bottom_blobs[i], top_blobs[i], opt);
}

return forward_inplace(top_blobs, cmd, opt);
@@ -136,11 +132,7 @@ int Layer::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, co
if (!support_inplace)
return -1;

top_blob.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator);
if (top_blob.empty())
return -100;

cmd.record_clone(bottom_blob, top_blob);
cmd.record_clone(bottom_blob, top_blob, opt);

return forward_inplace(top_blob, cmd, opt);
}


+ 4
- 4
src/layer/vulkan/binaryop_vulkan.cpp View File

@@ -317,21 +317,21 @@ int BinaryOp_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
// broadcast
if (bottom_blob.dims > bottom_blob1.dims)
{
top_blob.create_like(bottom_blob, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create_like(bottom_blob, opt.blob_vkallocator);
}
else if (bottom_blob.dims < bottom_blob1.dims)
{
top_blob.create_like(bottom_blob1, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create_like(bottom_blob1, opt.blob_vkallocator);
}
else // if (bottom_blob.dims == bottom_blob1.dims)
{
if (bottom_blob.w * bottom_blob.h * bottom_blob.c * bottom_blob.elempack >= bottom_blob1.w * bottom_blob1.h * bottom_blob1.c * bottom_blob1.elempack)
{
top_blob.create_like(bottom_blob, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create_like(bottom_blob, opt.blob_vkallocator);
}
else
{
top_blob.create_like(bottom_blob1, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create_like(bottom_blob1, opt.blob_vkallocator);
}
}
if (top_blob.empty())


+ 3
- 3
src/layer/vulkan/cast_vulkan.cpp View File

@@ -234,15 +234,15 @@ int Cast_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c

if (dims == 1)
{
top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator);
}
else if (dims == 2)
{
top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator);
}
else if (dims == 3)
{
top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator);
}
if (top_blob.empty())
return -100;


+ 9
- 9
src/layer/vulkan/concat_vulkan.cpp View File

@@ -312,14 +312,14 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V
}

VkMat& top_blob = top_blobs[0];
top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

VkMat top_blob_unpacked = top_blob;
if (elempack < out_elempack)
{
top_blob_unpacked.create(top_w / elempack, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
top_blob_unpacked.create(top_w / elempack, elemsize, elempack, opt.workspace_vkallocator);
if (top_blob_unpacked.empty())
return -100;
}
@@ -415,14 +415,14 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V
}

VkMat& top_blob = top_blobs[0];
top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

VkMat top_blob_unpacked = top_blob;
if (elempack < out_elempack)
{
top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_vkallocator);
if (top_blob_unpacked.empty())
return -100;
}
@@ -506,7 +506,7 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V
}

VkMat& top_blob = top_blobs[0];
top_blob.create(top_w, h, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(top_w, h, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

@@ -573,14 +573,14 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V
}

VkMat& top_blob = top_blobs[0];
top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

VkMat top_blob_unpacked = top_blob;
if (elempack < out_elempack)
{
top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_vkallocator);
if (top_blob_unpacked.empty())
return -100;
}
@@ -665,7 +665,7 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V
}

VkMat& top_blob = top_blobs[0];
top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

@@ -720,7 +720,7 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V
}

VkMat& top_blob = top_blobs[0];
top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;



+ 13
- 19
src/layer/vulkan/convolution_vulkan.cpp View File

@@ -1010,8 +1010,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
padding_param_blob.prepare_staging_buffer();
VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = hpad / 2;
@@ -1037,8 +1036,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
padding_param_blob.prepare_staging_buffer();
VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = hpad - hpad / 2;
@@ -1089,8 +1087,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
padding_param_blob.prepare_staging_buffer();
VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = 0;
@@ -1110,7 +1107,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
// transform input
VkMat bottom_tm_blob;
{
bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator);
if (bottom_tm_blob.empty())
return -100;

@@ -1138,7 +1135,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
// gemm
VkMat top_tm_blob;
{
top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
if (top_tm_blob.empty())
return -100;

@@ -1165,7 +1162,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
// transform output
VkMat top_blob_bordered;
{
top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob_bordered.empty())
return -100;

@@ -1193,8 +1190,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom

// crop top_blob
{
VkMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
crop_param_blob.prepare_staging_buffer();
VkMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
int* crop_params = crop_param_blob.mapped();

crop_params[0] = 0;
@@ -1232,8 +1228,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
padding_param_blob.prepare_staging_buffer();
VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = 0;
@@ -1253,7 +1248,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
// transform input
VkMat bottom_tm_blob;
{
bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator);
if (bottom_tm_blob.empty())
return -100;

@@ -1281,7 +1276,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
// gemm
VkMat top_tm_blob;
{
top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
if (top_tm_blob.empty())
return -100;

@@ -1308,7 +1303,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
// transform output
VkMat top_blob_bordered;
{
top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob_bordered.empty())
return -100;

@@ -1336,8 +1331,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom

// crop top_blob
{
VkMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
crop_param_blob.prepare_staging_buffer();
VkMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
int* crop_params = crop_param_blob.mapped();

crop_params[0] = 0;
@@ -1360,7 +1354,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
}


top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;



+ 4
- 6
src/layer/vulkan/convolutiondepthwise_vulkan.cpp View File

@@ -534,8 +534,7 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
padding_param_blob.prepare_staging_buffer();
VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = hpad / 2;
@@ -561,8 +560,7 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
padding_param_blob.prepare_staging_buffer();
VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = hpad - hpad / 2;
@@ -595,7 +593,7 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl
if (out_elempack == 1) out_elemsize = 4u;
}

top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

@@ -656,7 +654,7 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl
VkMat top_blob_unpacked = top_blob;
if (out_elempack_g < out_elempack)
{
top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator, opt.staging_vkallocator);
top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator);
if (top_blob_unpacked.empty())
return -100;
}


+ 2
- 2
src/layer/vulkan/crop_vulkan.cpp View File

@@ -381,7 +381,7 @@ int Crop_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c
packing->forward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1);
}

top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

@@ -515,7 +515,7 @@ int Crop_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkM

VkMat& top_blob = top_blobs[0];

top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;



+ 3
- 4
src/layer/vulkan/deconvolution_vulkan.cpp View File

@@ -414,11 +414,11 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC
VkMat top_blob_bordered;
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
{
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
}
else
{
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
}
if (top_blob_bordered.empty())
return -100;
@@ -528,8 +528,7 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC
int wcut = top_blob_bordered_adj.w - output_w;
int hcut = top_blob_bordered_adj.h - output_h;

VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
crop_param_blob.prepare_staging_buffer();
VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* crop_params = crop_param_blob.mapped();

if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)


+ 5
- 7
src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp View File

@@ -597,11 +597,11 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_
VkMat top_blob_bordered;
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
{
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
}
else
{
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
}
if (top_blob_bordered.empty())
return -100;
@@ -681,8 +681,7 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_
int wcut = top_blob_bordered_adj.w - output_w;
int hcut = top_blob_bordered_adj.h - output_h;

VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
crop_param_blob.prepare_staging_buffer();
VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* crop_params = crop_param_blob.mapped();

if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
@@ -763,7 +762,7 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_
VkMat top_blob_unpacked = top_blob_bordered;
if (out_elempack_g < out_elempack)
{
top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator, opt.staging_vkallocator);
top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator);
if (top_blob_unpacked.empty())
return -100;
}
@@ -883,8 +882,7 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_
int wcut = top_blob_bordered_adj.w - output_w;
int hcut = top_blob_bordered_adj.h - output_h;

VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
crop_param_blob.prepare_staging_buffer();
VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* crop_params = crop_param_blob.mapped();

if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)


+ 1
- 1
src/layer/vulkan/deepcopy_vulkan.cpp View File

@@ -144,7 +144,7 @@ int DeepCopy_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkComput
{
int elempack = bottom_blob.elempack;

top_blob.create_like(bottom_blob, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create_like(bottom_blob, opt.blob_vkallocator);
if (top_blob.empty())
return -100;



+ 1
- 1
src/layer/vulkan/eltwise_vulkan.cpp View File

@@ -157,7 +157,7 @@ int Eltwise_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<
int elempack = bottom_blob.elempack;

VkMat& top_blob = top_blobs[0];
top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;



+ 1
- 1
src/layer/vulkan/flatten_vulkan.cpp View File

@@ -205,7 +205,7 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
return 0;
}

top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;



+ 1
- 1
src/layer/vulkan/innerproduct_vulkan.cpp View File

@@ -306,7 +306,7 @@ int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo
if (out_elempack == 1) out_elemsize = 4u;
}

top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;



+ 7
- 7
src/layer/vulkan/instancenorm_vulkan.cpp View File

@@ -380,7 +380,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd,
int elempack = bottom_top_blob.elempack;

// mean
VkMat mean_workspace(c, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
VkMat mean_workspace(c, elemsize, elempack, opt.workspace_vkallocator);
{
// reduce sum
VkMat sum_workspace;
@@ -389,7 +389,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd,
int reduced_h = 1;
int reduced_c = bottom_top_blob.c;

sum_workspace.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator);
{
std::vector<VkMat> bindings(2);
bindings[0] = bottom_top_blob;
@@ -419,7 +419,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd,
int reduced_c = sum_workspace.c;

VkMat sum_workspace_reduced;
sum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator);

{
std::vector<VkMat> bindings(2);
@@ -466,11 +466,11 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd,
}

// var
VkMat var_workspace(c, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
VkMat var_workspace(c, elemsize, elempack, opt.workspace_vkallocator);
{
// sub mean and square
VkMat square_workspace;
square_workspace.create(w, h, c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
square_workspace.create(w, h, c, 4u*elempack, elempack, opt.workspace_vkallocator);
{
std::vector<VkMat> bindings(3);
bindings[0] = bottom_top_blob;
@@ -509,7 +509,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd,
int reduced_c = sqsum_workspace.c;

VkMat sqsum_workspace_reduced;
sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator);

{
std::vector<VkMat> bindings(2);
@@ -557,7 +557,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd,

// coeffs
VkMat coeffs_workspace;
coeffs_workspace.create(c, elemsize * 2, elempack * 2, opt.workspace_vkallocator, opt.staging_vkallocator);
coeffs_workspace.create(c, elemsize * 2, elempack * 2, opt.workspace_vkallocator);
{
std::vector<VkMat> bindings(5);
bindings[0] = coeffs_workspace;


+ 5
- 5
src/layer/vulkan/interp_vulkan.cpp View File

@@ -274,7 +274,7 @@ int Interp_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute&
return 0;
}

top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

@@ -306,11 +306,11 @@ int Interp_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute&
}
else if (resize_type == 3) // bicubic
{
VkMat alpha(outw, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator, opt.staging_vkallocator);
VkMat alpha(outw, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator);
if (alpha.empty())
return -100;

VkMat xofs(outw, (size_t)4u, 1, opt.workspace_vkallocator, opt.staging_vkallocator);
VkMat xofs(outw, (size_t)4u, 1, opt.workspace_vkallocator);
if (xofs.empty())
return -100;

@@ -328,11 +328,11 @@ int Interp_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute&
cmd.record_pipeline(pipeline_interp_bicubic_coeffs_x, bindings, constants, alpha);
}

VkMat beta(outh, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator, opt.staging_vkallocator);
VkMat beta(outh, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator);
if (beta.empty())
return -100;

VkMat yofs(outh, (size_t)4u, 1, opt.workspace_vkallocator, opt.staging_vkallocator);
VkMat yofs(outh, (size_t)4u, 1, opt.workspace_vkallocator);
if (yofs.empty())
return -100;



+ 2
- 2
src/layer/vulkan/lrn_vulkan.cpp View File

@@ -254,11 +254,11 @@ int LRN_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Op
if (region_type == NormRegion_ACROSS_CHANNELS)
{
// always create scalar square workspace blob for norm across channel
square_workspace.create(w, h, channels * elempack + local_size - 1, 4u, 1, opt.workspace_vkallocator, opt.staging_vkallocator);
square_workspace.create(w, h, channels * elempack + local_size - 1, 4u, 1, opt.workspace_vkallocator);
}
else if (region_type == NormRegion_WITHIN_CHANNEL)
{
square_workspace.create(w + local_size - 1, h + local_size - 1, channels, elempack * 4u, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
square_workspace.create(w + local_size - 1, h + local_size - 1, channels, elempack * 4u, elempack, opt.workspace_vkallocator);
}

// square pad


+ 3
- 3
src/layer/vulkan/normalize_vulkan.cpp View File

@@ -298,7 +298,7 @@ int Normalize_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co
reduced_c = (bottom_top_blob.c + 3) / 4;
}

sqsum_workspace.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
sqsum_workspace.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator);
{
std::vector<VkMat> bindings(2);
bindings[0] = bottom_top_blob;
@@ -347,7 +347,7 @@ int Normalize_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co
}

VkMat sqsum_workspace_reduced;
sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator);

{
std::vector<VkMat> bindings(2);
@@ -377,7 +377,7 @@ int Normalize_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co

// coeffs
VkMat coeffs_workspace;
coeffs_workspace.create(sqsum_workspace.w * sqsum_workspace.h * sqsum_workspace.c, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
coeffs_workspace.create(sqsum_workspace.w * sqsum_workspace.h * sqsum_workspace.c, elemsize, elempack, opt.workspace_vkallocator);
{
std::vector<VkMat> bindings(2);
bindings[0] = sqsum_workspace;


+ 3
- 3
src/layer/vulkan/packing_vulkan.cpp View File

@@ -203,7 +203,7 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
if (out_elempack == 1) out_elemsize = 4u;
}

top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;
}
@@ -219,7 +219,7 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
if (out_elempack == 1) out_elemsize = 4u;
}

top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;
}
@@ -235,7 +235,7 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
if (out_elempack == 1) out_elemsize = 4u;
}

top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;
}


+ 2
- 2
src/layer/vulkan/padding_vulkan.cpp View File

@@ -170,7 +170,7 @@ int Padding_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
int outw = w + left + right;
int outh = h + top + bottom;

top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

@@ -239,7 +239,7 @@ int Padding_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<
int outw = w + _left + _right;
int outh = h + _top + _bottom;

top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;



+ 2
- 2
src/layer/vulkan/permute_vulkan.cpp View File

@@ -270,7 +270,7 @@ int Permute_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
if (out_elempack == 1) out_elemsize = 4u;
}

top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;
}
@@ -329,7 +329,7 @@ int Permute_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
if (out_elempack == 1) out_elemsize = 4u;
}

top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;
}


+ 1
- 1
src/layer/vulkan/pixelshuffle_vulkan.cpp View File

@@ -200,7 +200,7 @@ int PixelShuffle_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo
if (out_elempack == 1) out_elemsize = 4u;
}

top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;



+ 6
- 11
src/layer/vulkan/pooling_vulkan.cpp View File

@@ -287,7 +287,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute

if (global_pooling)
{
top_blob.create(channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

@@ -295,7 +295,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
bindings[0] = bottom_blob;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(12);
std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
@@ -306,8 +306,6 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;
constants[10].i = 0;
constants[11].i = 0;

const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_global_pack8
: elempack == 4 ? pipeline_pooling_global_pack4
@@ -336,8 +334,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
padding_param_blob.prepare_staging_buffer();
VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = pad_top;
@@ -369,8 +366,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
padding_param_blob.prepare_staging_buffer();
VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = hpad / 2;
@@ -396,8 +392,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
padding_param_blob.prepare_staging_buffer();
VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = hpad - hpad / 2;
@@ -421,7 +416,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
int outw = (w - kernel_w) / stride_w + 1;
int outh = (h - kernel_h) / stride_h + 1;

top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;



+ 2
- 2
src/layer/vulkan/priorbox_vulkan.cpp View File

@@ -163,7 +163,7 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
}

VkMat& top_blob = top_blobs[0];
top_blob.create(4 * w * h * num_prior / elempack, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(4 * w * h * num_prior / elempack, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

@@ -217,7 +217,7 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
}

VkMat& top_blob = top_blobs[0];
top_blob.create(4 * w * h * num_prior, 2, elemsize, 1, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(4 * w * h * num_prior, 2, elemsize, 1, opt.blob_vkallocator);
if (top_blob.empty())
return -100;



+ 1
- 1
src/layer/vulkan/reorg_vulkan.cpp View File

@@ -192,7 +192,7 @@ int Reorg_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute&
if (out_elempack == 1) out_elemsize = 4u;
}

top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;



+ 3
- 3
src/layer/vulkan/reshape_vulkan.cpp View File

@@ -275,7 +275,7 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
return 0;
}

top_blob.create(_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
}
else if (ndim == 2)
{
@@ -308,7 +308,7 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
return 0;
}

top_blob.create(_w, _h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(_w, _h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
}
else // if (ndim == 3)
{
@@ -348,7 +348,7 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
return 0;
}

top_blob.create(_w, _h, _c / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(_w, _h, _c / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
}

if (top_blob.empty())


+ 1
- 1
src/layer/vulkan/shufflechannel_vulkan.cpp View File

@@ -142,7 +142,7 @@ int ShuffleChannel_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, Vk
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;



+ 6
- 6
src/layer/vulkan/slice_vulkan.cpp View File

@@ -314,7 +314,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk
}

VkMat& top_blob = top_blobs[i];
top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

@@ -416,7 +416,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk
}

VkMat& top_blob = top_blobs[i];
top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

@@ -508,7 +508,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk
}

VkMat& top_blob = top_blobs[i];
top_blob.create(slice, h, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(slice, h, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

@@ -576,7 +576,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk
}

VkMat& top_blob = top_blobs[i];
top_blob.create(w, h, slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(w, h, slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

@@ -669,7 +669,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk
}

VkMat& top_blob = top_blobs[i];
top_blob.create(w, slice, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(w, slice, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

@@ -727,7 +727,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk
}

VkMat& top_blob = top_blobs[i];
top_blob.create(slice, h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(slice, h, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;



+ 12
- 12
src/layer/vulkan/softmax_vulkan.cpp View File

@@ -283,33 +283,33 @@ int Softmax_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, cons

if (dims == 1) // axis == 0
{
max_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
max_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator);
sum_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator);
}
else if (dims == 2 && axis == 0)
{
max_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
max_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator);
sum_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator);
}
else if (dims == 2 && axis == 1)
{
max_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
max_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator);
sum_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator);
}
else if (dims == 3 && axis == 0)
{
max_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
max_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator);
sum_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator);
}
else if (dims == 3 && axis == 1)
{
max_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
max_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator);
sum_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator);
}
else if (dims == 3 && axis == 2)
{
max_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
max_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator);
sum_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator);
}

// reduce max


+ 82
- 176
src/mat.h View File

@@ -261,59 +261,51 @@ public:
// empty
VkMat();
// vec
VkMat(int w, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
VkMat(int w, size_t elemsize, VkAllocator* allocator);
// image
VkMat(int w, int h, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
VkMat(int w, int h, size_t elemsize, VkAllocator* allocator);
// dim
VkMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
VkMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
// packed vec
VkMat(int w, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
VkMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
// packed image
VkMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
VkMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
// packed dim
VkMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
VkMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
// copy
VkMat(const VkMat& m);
// external vec
VkMat(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
VkMat(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
// external image
VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
// external dim
VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
// external packed vec
VkMat(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
VkMat(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
// external packed image
VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
// external packed dim
VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
// release
~VkMat();
// assign
VkMat& operator=(const VkMat& m);
// allocate vec
void create(int w, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
void create(int w, size_t elemsize, VkAllocator* allocator);
// allocate image
void create(int w, int h, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
void create(int w, int h, size_t elemsize, VkAllocator* allocator);
// allocate dim
void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
// allocate packed vec
void create(int w, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
// allocate packed image
void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
// allocate packed dim
void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
// allocate like
void create_like(const Mat& m, VkAllocator* allocator, VkAllocator* staging_allocator);
void create_like(const Mat& m, VkAllocator* allocator);
// allocate like
void create_like(const VkMat& m, VkAllocator* allocator, VkAllocator* staging_allocator);

// staging buffer
void prepare_staging_buffer();
void discard_staging_buffer();

// copy
void upload(const Mat& m);
void download(Mat& m) const;
void create_like(const VkMat& m, VkAllocator* allocator);

// mapped
Mat mapped() const;
@@ -333,19 +325,14 @@ public:
// low-level reference
VkBuffer buffer() const;
size_t buffer_offset() const;
VkBuffer staging_buffer() const;
size_t staging_buffer_offset() const;
size_t buffer_capacity() const;

// device buffer
VkBufferMemory* data;

// staging buffer
VkBufferMemory* staging_data;

// pointer to the reference counter
// when points to user-allocated data, the pointer is NULL
int* refcount;
int* staging_refcount;

// element size in bytes
// 4 = float32/int32
@@ -362,7 +349,6 @@ public:

// the allocator
VkAllocator* allocator;
VkAllocator* staging_allocator;

// the dimension rank
int dims;
@@ -1234,90 +1220,87 @@ inline const float& Mat::operator[](size_t i) const
#if NCNN_VULKAN

inline VkMat::VkMat()
: data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
: data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
{
}

inline VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
: data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
inline VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator)
: data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
{
create(_w, _elemsize, _allocator, _staging_allocator);
create(_w, _elemsize, _allocator);
}

inline VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
: data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
inline VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
: data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
{
create(_w, _h, _elemsize, _allocator, _staging_allocator);
create(_w, _h, _elemsize, _allocator);
}

inline VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
: data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
inline VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
: data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
{
create(_w, _h, _c, _elemsize, _allocator, _staging_allocator);
create(_w, _h, _c, _elemsize, _allocator);
}

inline VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
: data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
inline VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
: data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
{
create(_w, _elemsize, _elempack, _allocator, _staging_allocator);
create(_w, _elemsize, _elempack, _allocator);
}

inline VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
: data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
inline VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
: data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
{
create(_w, _h, _elemsize, _elempack, _allocator, _staging_allocator);
create(_w, _h, _elemsize, _elempack, _allocator);
}

inline VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
: data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
inline VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
: data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
{
create(_w, _h, _c, _elemsize, _elempack, _allocator, _staging_allocator);
create(_w, _h, _c, _elemsize, _elempack, _allocator);
}

inline VkMat::VkMat(const VkMat& m)
: data(m.data), staging_data(m.staging_data), refcount(m.refcount), staging_refcount(m.staging_refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), staging_allocator(m.staging_allocator), dims(m.dims), w(m.w), h(m.h), c(m.c)
: data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), c(m.c)
{
if (refcount)
NCNN_XADD(refcount, 1);

if (staging_refcount)
NCNN_XADD(staging_refcount, 1);

cstep = m.cstep;
}

inline VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
: data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), staging_allocator(_staging_allocator), dims(1), w(_w), h(1), c(1)
inline VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
: data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), c(1)
{
cstep = w;
}

inline VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
: data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), staging_allocator(_staging_allocator), dims(2), w(_w), h(_h), c(1)
inline VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
: data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), c(1)
{
cstep = w * h;
}

inline VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
: data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), staging_allocator(_staging_allocator), dims(3), w(_w), h(_h), c(_c)
inline VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
: data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), c(_c)
{
cstep = alignSize(w * h * elemsize, 16) / elemsize;
}

inline VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
: data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), staging_allocator(_staging_allocator), dims(1), w(_w), h(1), c(1)
inline VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
: data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), c(1)
{
cstep = w;
}

inline VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
: data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), staging_allocator(_staging_allocator), dims(2), w(_w), h(_h), c(1)
inline VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
: data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), c(1)
{
cstep = w * h;
}

inline VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
: data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), staging_allocator(_staging_allocator), dims(3), w(_w), h(_h), c(_c)
inline VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
: data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), c(_c)
{
cstep = alignSize(w * h * elemsize, 16) / elemsize;
}
@@ -1335,19 +1318,13 @@ inline VkMat& VkMat::operator=(const VkMat& m)
if (m.refcount)
NCNN_XADD(m.refcount, 1);

if (m.staging_refcount)
NCNN_XADD(m.staging_refcount, 1);

release();

data = m.data;
staging_data = m.staging_data;
refcount = m.refcount;
staging_refcount = m.staging_refcount;
elemsize = m.elemsize;
elempack = m.elempack;
allocator = m.allocator;
staging_allocator = m.staging_allocator;

dims = m.dims;
w = m.w;
@@ -1359,9 +1336,9 @@ inline VkMat& VkMat::operator=(const VkMat& m)
return *this;
}

inline void VkMat::create(int _w, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
inline void VkMat::create(int _w, size_t _elemsize, VkAllocator* _allocator)
{
if (dims == 1 && w == _w && elemsize == _elemsize && elempack == 1 && allocator == _allocator && staging_allocator == _staging_allocator)
if (dims == 1 && w == _w && elemsize == _elemsize && elempack == 1 && allocator == _allocator)
return;

release();
@@ -1369,7 +1346,6 @@ inline void VkMat::create(int _w, size_t _elemsize, VkAllocator* _allocator, VkA
elemsize = _elemsize;
elempack = 1;
allocator = _allocator;
staging_allocator = _staging_allocator;

dims = 1;
w = _w;
@@ -1389,9 +1365,9 @@ inline void VkMat::create(int _w, size_t _elemsize, VkAllocator* _allocator, VkA
}
}

inline void VkMat::create(int _w, int _h, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
inline void VkMat::create(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
{
if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == 1 && allocator == _allocator && staging_allocator == _staging_allocator)
if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == 1 && allocator == _allocator)
return;

release();
@@ -1399,7 +1375,6 @@ inline void VkMat::create(int _w, int _h, size_t _elemsize, VkAllocator* _alloca
elemsize = _elemsize;
elempack = 1;
allocator = _allocator;
staging_allocator = _staging_allocator;

dims = 2;
w = _w;
@@ -1419,9 +1394,9 @@ inline void VkMat::create(int _w, int _h, size_t _elemsize, VkAllocator* _alloca
}
}

inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
{
if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == 1 && allocator == _allocator && staging_allocator == _staging_allocator)
if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == 1 && allocator == _allocator)
return;

release();
@@ -1429,7 +1404,6 @@ inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator*
elemsize = _elemsize;
elempack = 1;
allocator = _allocator;
staging_allocator = _staging_allocator;

dims = 3;
w = _w;
@@ -1449,9 +1423,9 @@ inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator*
}
}

inline void VkMat::create(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
inline void VkMat::create(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
{
if (dims == 1 && w == _w && elemsize == _elemsize && elempack == _elempack && allocator == _allocator && staging_allocator == _staging_allocator)
if (dims == 1 && w == _w && elemsize == _elemsize && elempack == _elempack && allocator == _allocator)
return;

release();
@@ -1459,7 +1433,6 @@ inline void VkMat::create(int _w, size_t _elemsize, int _elempack, VkAllocator*
elemsize = _elemsize;
elempack = _elempack;
allocator = _allocator;
staging_allocator = _staging_allocator;

dims = 1;
w = _w;
@@ -1479,9 +1452,9 @@ inline void VkMat::create(int _w, size_t _elemsize, int _elempack, VkAllocator*
}
}

inline void VkMat::create(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
inline void VkMat::create(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
{
if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == _elempack && allocator == _allocator && staging_allocator == _staging_allocator)
if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == _elempack && allocator == _allocator)
return;

release();
@@ -1489,7 +1462,6 @@ inline void VkMat::create(int _w, int _h, size_t _elemsize, int _elempack, VkAll
elemsize = _elemsize;
elempack = _elempack;
allocator = _allocator;
staging_allocator = _staging_allocator;

dims = 2;
w = _w;
@@ -1509,9 +1481,9 @@ inline void VkMat::create(int _w, int _h, size_t _elemsize, int _elempack, VkAll
}
}

inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
{
if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == _elempack && allocator == _allocator && staging_allocator == _staging_allocator)
if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == _elempack && allocator == _allocator)
return;

release();
@@ -1519,7 +1491,6 @@ inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempac
elemsize = _elemsize;
elempack = _elempack;
allocator = _allocator;
staging_allocator = _staging_allocator;

dims = 3;
w = _w;
@@ -1539,82 +1510,33 @@ inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempac
}
}

inline void VkMat::create_like(const Mat& m, VkAllocator* _allocator, VkAllocator* _staging_allocator)
inline void VkMat::create_like(const Mat& m, VkAllocator* _allocator)
{
int _dims = m.dims;
if (_dims == 1)
create(m.w, m.elemsize, m.elempack, _allocator, _staging_allocator);
create(m.w, m.elemsize, m.elempack, _allocator);
if (_dims == 2)
create(m.w, m.h, m.elemsize, m.elempack, _allocator, _staging_allocator);
create(m.w, m.h, m.elemsize, m.elempack, _allocator);
if (_dims == 3)
create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator, _staging_allocator);
create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator);
}

inline void VkMat::create_like(const VkMat& m, VkAllocator* _allocator, VkAllocator* _staging_allocator)
inline void VkMat::create_like(const VkMat& m, VkAllocator* _allocator)
{
int _dims = m.dims;
if (_dims == 1)
create(m.w, m.elemsize, m.elempack, _allocator, _staging_allocator);
create(m.w, m.elemsize, m.elempack, _allocator);
if (_dims == 2)
create(m.w, m.h, m.elemsize, m.elempack, _allocator, _staging_allocator);
create(m.w, m.h, m.elemsize, m.elempack, _allocator);
if (_dims == 3)
create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator, _staging_allocator);
}

inline void VkMat::prepare_staging_buffer()
{
if (allocator->mappable)
return;

if (staging_allocator && staging_data)
return;

size_t totalsize = alignSize(total() * elemsize, 4);
staging_data = staging_allocator->fastMalloc(totalsize);

staging_refcount = (int*)((unsigned char*)staging_data + offsetof(VkBufferMemory, refcount));
*staging_refcount = 1;
}

inline void VkMat::discard_staging_buffer()
{
if (allocator->mappable)
return;

if (staging_refcount && NCNN_XADD(staging_refcount, -1) == 1)
{
if (staging_allocator && staging_data)
{
staging_allocator->fastFree(staging_data);
}
}

staging_data = 0;
staging_refcount = 0;
}

inline void VkMat::upload(const Mat& m)
{
memcpy(mapped_ptr(), m.data, m.total() * m.elemsize);

if (allocator->mappable)
{
allocator->flush(data);
}
}

inline void VkMat::download(Mat& m) const
{
if (allocator->mappable)
{
allocator->invalidate(data);
}

memcpy(m.data, mapped_ptr(), total() * elemsize);
create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator);
}

inline Mat VkMat::mapped() const
{
if (!allocator->mappable)
return Mat();

if (dims == 1)
return Mat(w, mapped_ptr(), elemsize, elempack, 0);

@@ -1629,17 +1551,16 @@ inline Mat VkMat::mapped() const

inline void* VkMat::mapped_ptr() const
{
VkBufferMemory* mappable_data = allocator->mappable ? data : staging_data;
return (unsigned char*)mappable_data->mapped_ptr + mappable_data->offset;
if (!allocator->mappable)
return 0;

return (unsigned char*)data->mapped_ptr + data->offset;
}

inline void VkMat::addref()
{
if (refcount)
NCNN_XADD(refcount, 1);

if (staging_refcount)
NCNN_XADD(staging_refcount, 1);
}

inline void VkMat::release()
@@ -1652,16 +1573,7 @@ inline void VkMat::release()
}
}

if (staging_refcount && NCNN_XADD(staging_refcount, -1) == 1)
{
if (staging_allocator && staging_data)
{
staging_allocator->fastFree(staging_data);
}
}

data = 0;
staging_data = 0;

elemsize = 0;
elempack = 0;
@@ -1674,7 +1586,6 @@ inline void VkMat::release()
cstep = 0;

refcount = 0;
staging_refcount = 0;
}

inline bool VkMat::empty() const
@@ -1709,14 +1620,9 @@ inline size_t VkMat::buffer_offset() const
return data->offset;
}

inline VkBuffer VkMat::staging_buffer() const
{
return staging_data->buffer;
}

inline size_t VkMat::staging_buffer_offset() const
inline size_t VkMat::buffer_capacity() const
{
return staging_data->offset;
return data->capacity;
}

inline VkImageMat::VkImageMat()


+ 23
- 64
src/net.cpp View File

@@ -924,14 +924,16 @@ int Net::upload_model()
weight_staging_vkallocator = new VkWeightStagingBufferAllocator(vkdev);
}

cmd.weight_vkallocator = weight_vkallocator;
cmd.staging_vkallocator = weight_staging_vkallocator;
Option opt_upload = opt;
opt_upload.blob_vkallocator = weight_vkallocator;
opt_upload.workspace_vkallocator = weight_vkallocator;
opt_upload.staging_vkallocator = weight_staging_vkallocator;

for (size_t i=0; i<layers.size(); i++)
{
if (layers[i]->support_vulkan)
{
int uret = layers[i]->upload_model(cmd, opt);
int uret = layers[i]->upload_model(cmd, opt_upload);
if (uret != 0)
{
fprintf(stderr, "layer upload_model %d failed\n", (int)i);
@@ -1347,12 +1349,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector

// upload
VkMat bottom_blob_unpacked;
bottom_blob_unpacked.create_like(bottom_blob_cpu_fp16, opt.blob_vkallocator, opt.staging_vkallocator);

bottom_blob_unpacked.prepare_staging_buffer();
bottom_blob_unpacked.upload(bottom_blob_cpu_fp16);

cmd.record_upload(bottom_blob_unpacked);
cmd.record_upload(bottom_blob_cpu_fp16, bottom_blob_unpacked, opt);

// cast to fp16 (integrated gpu)
VkMat bottom_blob_unpacked_fp16;
@@ -1390,11 +1387,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
if (layer->support_inplace && *bottom_blob.refcount != 1)
{
VkMat bottom_blob_copy;
bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator);

cmd.record_clone(bottom_blob, bottom_blob_copy, opt);
// fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset());

cmd.record_clone(bottom_blob, bottom_blob_copy);
bottom_blob = bottom_blob_copy;
}
}
@@ -1437,7 +1431,6 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
{
// load bottom blobs
std::vector<VkMat> bottom_blobs(layer->bottoms.size());
std::vector<VkMat> bottom_blobs_unpacked(layer->bottoms.size());
for (size_t i=0; i<layer->bottoms.size(); i++)
{
int bottom_blob_index = layer->bottoms[i];
@@ -1471,13 +1464,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
}

// upload
VkMat& bottom_blob_unpacked = bottom_blobs_unpacked[i];
bottom_blob_unpacked.create_like(bottom_blob_cpu_fp16, opt.blob_vkallocator, opt.staging_vkallocator);

bottom_blob_unpacked.prepare_staging_buffer();
bottom_blob_unpacked.upload(bottom_blob_cpu_fp16);

cmd.record_upload(bottom_blob_unpacked);
VkMat bottom_blob_unpacked;
cmd.record_upload(bottom_blob_cpu_fp16, bottom_blob_unpacked, opt);

// cast to fp16 (integrated gpu)
VkMat bottom_blob_unpacked_fp16;
@@ -1515,11 +1503,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
if (layer->support_inplace && *bottom_blobs[i].refcount != 1)
{
VkMat bottom_blob_copy;
bottom_blob_copy.create_like(bottom_blobs[i], bottom_blobs[i].allocator, bottom_blobs[i].staging_allocator);

cmd.record_clone(bottom_blobs[i], bottom_blob_copy, opt);
// fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blobs[i].buffer(), bottom_blobs[i].buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset());

cmd.record_clone(bottom_blobs[i], bottom_blob_copy);
bottom_blobs[i] = bottom_blob_copy;
}
}
@@ -1602,11 +1587,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
if (layer->support_inplace && *bottom_blob.refcount != 1)
{
VkMat bottom_blob_copy;
bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator);

cmd.record_clone(bottom_blob, bottom_blob_copy, opt);
// fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset());

cmd.record_clone(bottom_blob, bottom_blob_copy);
bottom_blob = bottom_blob_copy;
}
}
@@ -1614,7 +1596,6 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
VkMat bottom_blob_unpacked_fp16;
if (opt.use_packing_layout && layer->support_packing)
{
// bottom_blob_unpacked_fp16 = bottom_blob;
packing_pack4->forward(bottom_blob, bottom_blob_unpacked_fp16, cmd, opt);
}
else
@@ -1635,8 +1616,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
}

// download
bottom_blob_unpacked.prepare_staging_buffer();
cmd.record_download(bottom_blob_unpacked);
Mat bottom_blob_cpu_fp16;
cmd.record_download(bottom_blob_unpacked, bottom_blob_cpu_fp16, opt);

cmd.submit_and_wait();

@@ -1657,12 +1638,6 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector

cmd.reset();

Mat bottom_blob_cpu_fp16;
bottom_blob_cpu_fp16.create_like(bottom_blob_unpacked, opt.blob_allocator);
bottom_blob_unpacked.download(bottom_blob_cpu_fp16);

bottom_blob_unpacked.discard_staging_buffer();

// cast to fp32 (discrete gpu)
Mat& bottom_blob_cpu = blob_mats[bottom_blob_index];
if (opt.use_fp16_storage && vkdev->info.type == 0)
@@ -1742,7 +1717,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
else
{
// load bottom blobs
std::vector<VkMat> bottom_blobs_unpacked(layer->bottoms.size());
std::vector<Mat> bottom_blobs_cpu_fp16(layer->bottoms.size());
for (size_t i=0; i<layer->bottoms.size(); i++)
{
int bottom_blob_index = layer->bottoms[i];
@@ -1770,11 +1745,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
if (layer->support_inplace && *bottom_blob.refcount != 1)
{
VkMat bottom_blob_copy;
bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator);

cmd.record_clone(bottom_blob, bottom_blob_copy, opt);
// fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset());

cmd.record_clone(bottom_blob, bottom_blob_copy);
bottom_blob = bottom_blob_copy;
}
}
@@ -1782,7 +1754,6 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
VkMat bottom_blob_unpacked_fp16;
if (opt.use_packing_layout && layer->support_packing)
{
// bottom_blob_unpacked_fp16 = bottom_blob;
packing_pack4->forward(bottom_blob, bottom_blob_unpacked_fp16, cmd, opt);
}
else
@@ -1792,7 +1763,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
}

// cast to fp32 (integrated gpu)
VkMat& bottom_blob_unpacked = bottom_blobs_unpacked[i];
VkMat bottom_blob_unpacked;
if (opt.use_fp16_storage && vkdev->info.type != 0)
{
cast_float16_to_float32->forward(bottom_blob_unpacked_fp16, bottom_blob_unpacked, cmd, opt);
@@ -1803,8 +1774,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
}

// download
bottom_blob_unpacked.prepare_staging_buffer();
cmd.record_download(bottom_blob_unpacked);
Mat& bottom_blob_cpu_fp16 = bottom_blobs_cpu_fp16[i];
cmd.record_download(bottom_blob_unpacked, bottom_blob_cpu_fp16, opt);
}
}
}
@@ -1837,13 +1808,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector

if (blob_mats[bottom_blob_index].dims == 0)
{
VkMat& bottom_blob_unpacked = bottom_blobs_unpacked[i];

Mat bottom_blob_cpu_fp16;
bottom_blob_cpu_fp16.create_like(bottom_blob_unpacked, opt.blob_allocator);
bottom_blob_unpacked.download(bottom_blob_cpu_fp16);

bottom_blob_unpacked.discard_staging_buffer();
const Mat& bottom_blob_cpu_fp16 = bottom_blobs_cpu_fp16[i];

// cast to fp32 (discrete gpu)
Mat& bottom_blob_cpu = blob_mats[bottom_blob_index];
@@ -1884,7 +1849,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
}
}

bottom_blobs_unpacked.clear();
bottom_blobs_cpu_fp16.clear();

// forward
if (opt.lightmode && layer->support_inplace)
@@ -2113,15 +2078,15 @@ int Extractor::extract(int blob_index, Mat& feat)
}

// download
feat_gpu_unpacked.prepare_staging_buffer();
cmd.record_download(feat_gpu_unpacked);
Mat feat_cpu_fp16;
cmd.record_download(feat_gpu_unpacked, feat_cpu_fp16, opt);

cmd.submit_and_wait();

#if NCNN_BENCHMARK
std::vector<uint64_t> results(net->layers.size() * 2);
cmd.get_query_pool_results(0, net->layers.size() * 2, results);
for (int i=0; i<net->layers.size(); i++)
for (size_t i=0; i<net->layers.size(); i++)
{
uint64_t start = results[i*2];
uint64_t end = results[i*2+1];
@@ -2133,12 +2098,6 @@ int Extractor::extract(int blob_index, Mat& feat)
}
#endif // NCNN_BENCHMARK

Mat feat_cpu_fp16;
feat_cpu_fp16.create_like(feat_gpu_unpacked, opt.blob_allocator);
feat_gpu_unpacked.download(feat_cpu_fp16);

feat_gpu_unpacked.discard_staging_buffer();

// cast to fp32 (discrete gpu)
Mat& feat_cpu = blob_mats[blob_index];
if (opt.use_fp16_storage && net->vkdev->info.type == 0)


+ 1
- 1
src/pipeline.cpp View File

@@ -267,7 +267,7 @@ int Pipeline::create_pipeline_layout(int push_constant_count)
VkPushConstantRange pushConstantRange;
pushConstantRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
pushConstantRange.offset = 0;
pushConstantRange.size = sizeof(int) * push_constant_count;
pushConstantRange.size = sizeof(vk_constant_type) * push_constant_count;

VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo;
pipelineLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;


+ 16
- 34
tests/test_cast.cpp View File

@@ -207,39 +207,30 @@ static int test_cast_gpu_fp16p(const ncnn::Mat& a, int type_from, int type_to)
a4_fp16 = a4;
}

// upload
ncnn::VkMat a4_gpu;
a4_gpu.create_like(a4_fp16, opt.blob_vkallocator, opt.staging_vkallocator);
a4_gpu.prepare_staging_buffer();
a4_gpu.upload(a4_fp16);

// forward
ncnn::VkCompute cmd(vkdev);

cmd.record_upload(a4_gpu);
// upload
ncnn::VkMat a4_gpu;
cmd.record_upload(a4_fp16, a4_gpu, opt);

ncnn::VkMat d4_gpu;
if (op->support_inplace)
{
d4_gpu.create_like(a4_gpu, a4_gpu.allocator, a4_gpu.staging_allocator);
cmd.record_clone(a4_gpu, d4_gpu);
op->forward_inplace(d4_gpu, cmd, opt);
op->forward_inplace(a4_gpu, cmd, opt);
d4_gpu = a4_gpu;
}
else
{
op->forward(a4_gpu, d4_gpu, cmd, opt);
}

d4_gpu.prepare_staging_buffer();

cmd.record_download(d4_gpu);
// download
cmd.record_download(d4_gpu, d, opt);

cmd.submit_and_wait();

// download
d.create_like(d4_gpu);
d4_gpu.download(d);

op->destroy_pipeline(opt);

delete op;
@@ -331,39 +322,30 @@ static int test_cast_gpu_fp16p_pack8(const ncnn::Mat& a, int type_from, int type
a4_fp16 = a4;
}

// upload
ncnn::VkMat a4_gpu;
a4_gpu.create_like(a4_fp16, opt.blob_vkallocator, opt.staging_vkallocator);
a4_gpu.prepare_staging_buffer();
a4_gpu.upload(a4_fp16);

// forward
ncnn::VkCompute cmd(vkdev);

cmd.record_upload(a4_gpu);
// upload
ncnn::VkMat a4_gpu;
cmd.record_upload(a4_fp16, a4_gpu, opt);

ncnn::VkMat d4_gpu;
if (op->support_inplace)
{
d4_gpu.create_like(a4_gpu, a4_gpu.allocator, a4_gpu.staging_allocator);
cmd.record_clone(a4_gpu, d4_gpu);
op->forward_inplace(d4_gpu, cmd, opt);
op->forward_inplace(a4_gpu, cmd, opt);
d4_gpu = a4_gpu;
}
else
{
op->forward(a4_gpu, d4_gpu, cmd, opt);
}

d4_gpu.prepare_staging_buffer();

cmd.record_download(d4_gpu);
// download
cmd.record_download(d4_gpu, d, opt);

cmd.submit_and_wait();

// download
d.create_like(d4_gpu);
d4_gpu.download(d);

op->destroy_pipeline(opt);

delete op;


+ 27
- 54
tests/testutil.h View File

@@ -261,10 +261,13 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn:
if (opt.use_vulkan_compute)
{
ncnn::VkTransfer cmd(vkdev);
cmd.weight_vkallocator = &g_weight_vkallocator;
cmd.staging_vkallocator = &g_weight_staging_vkallocator;

op->upload_model(cmd, opt);
ncnn::Option opt_upload = opt;
opt_upload.blob_vkallocator = &g_weight_vkallocator;
opt_upload.workspace_vkallocator = &g_weight_vkallocator;
opt_upload.staging_vkallocator = &g_weight_staging_vkallocator;

op->upload_model(cmd, opt_upload);

cmd.submit_and_wait();
}
@@ -367,57 +370,35 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn:
}
}

// upload
std::vector<ncnn::VkMat> a4_fp16_gpu(a4_fp16.size());
for (size_t i=0; i<a4_fp16.size(); i++)
{
a4_fp16_gpu[i].create_like(a4_fp16[i], opt.blob_vkallocator, opt.staging_vkallocator);
a4_fp16_gpu[i].prepare_staging_buffer();
a4_fp16_gpu[i].upload(a4_fp16[i]);
}

// forward
ncnn::VkCompute cmd(vkdev);

// upload
std::vector<ncnn::VkMat> a4_fp16_gpu(a4_fp16.size());
for (size_t i=0; i<a4_fp16_gpu.size(); i++)
{
cmd.record_upload(a4_fp16_gpu[i]);
cmd.record_upload(a4_fp16[i], a4_fp16_gpu[i], opt);
}

std::vector<ncnn::VkMat> d4_fp16_gpu(top_blob_count);
if (op->support_inplace)
{
for (size_t i=0; i<a4_fp16_gpu.size(); i++)
{
d4_fp16_gpu[i].create_like(a4_fp16_gpu[i], a4_fp16_gpu[i].allocator, a4_fp16_gpu[i].staging_allocator);
cmd.record_clone(a4_fp16_gpu[i], d4_fp16_gpu[i]);
}
op->forward_inplace(a4_fp16_gpu, cmd, opt);

op->forward_inplace(d4_fp16_gpu, cmd, opt);
d4_fp16_gpu = a4_fp16_gpu;
}
else
{
op->forward(a4_fp16_gpu, d4_fp16_gpu, cmd, opt);
}

// download
for (size_t i=0; i<d4_fp16_gpu.size(); i++)
{
d4_fp16_gpu[i].prepare_staging_buffer();
}

for (size_t i=0; i<d4_fp16_gpu.size(); i++)
{
cmd.record_download(d4_fp16_gpu[i]);
cmd.record_download(d4_fp16_gpu[i], d[i], opt);
}

cmd.submit_and_wait();

// download
for (size_t i=0; i<d4_fp16_gpu.size(); i++)
{
d[i].create_like(d4_fp16_gpu[i]);
d4_fp16_gpu[i].download(d[i]);
}
}
#endif // NCNN_VULKAN

@@ -509,14 +490,15 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn:
if (opt.use_vulkan_compute)
{
ncnn::VkTransfer cmd(vkdev);
cmd.weight_vkallocator = &g_weight_vkallocator;
cmd.staging_vkallocator = &g_weight_staging_vkallocator;

op->upload_model(cmd, opt);
ncnn::Option opt_upload = opt;
opt_upload.blob_vkallocator = &g_weight_vkallocator;
opt_upload.workspace_vkallocator = &g_weight_vkallocator;
opt_upload.staging_vkallocator = &g_weight_staging_vkallocator;

cmd.submit_and_wait();
op->upload_model(cmd, opt_upload);

g_weight_staging_vkallocator.clear();
cmd.submit_and_wait();
}
#endif // NCNN_VULKAN

@@ -594,38 +576,29 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn:
a4_fp16 = a4;
}

// upload
ncnn::VkMat a4_fp16_gpu;
a4_fp16_gpu.create_like(a4_fp16, opt.blob_vkallocator, opt.staging_vkallocator);
a4_fp16_gpu.prepare_staging_buffer();
a4_fp16_gpu.upload(a4_fp16);

// forward
ncnn::VkCompute cmd(vkdev);

cmd.record_upload(a4_fp16_gpu);
// upload
ncnn::VkMat a4_fp16_gpu;
cmd.record_upload(a4_fp16, a4_fp16_gpu, opt);

ncnn::VkMat d4_fp16_gpu;
if (op->support_inplace)
{
d4_fp16_gpu.create_like(a4_fp16_gpu, a4_fp16_gpu.allocator, a4_fp16_gpu.staging_allocator);
cmd.record_clone(a4_fp16_gpu, d4_fp16_gpu);
op->forward_inplace(d4_fp16_gpu, cmd, opt);
op->forward_inplace(a4_fp16_gpu, cmd, opt);
d4_fp16_gpu = a4_fp16_gpu;
}
else
{
op->forward(a4_fp16_gpu, d4_fp16_gpu, cmd, opt);
}

d4_fp16_gpu.prepare_staging_buffer();

cmd.record_download(d4_fp16_gpu);
// download
cmd.record_download(d4_fp16_gpu, d, opt);

cmd.submit_and_wait();

// download
d.create_like(d4_fp16_gpu);
d4_fp16_gpu.download(d);
}
#endif // NCNN_VULKAN



Loading…
Cancel
Save