Merge remote-tracking branch 'origin/fast_math' into fast_math

11 months ago · 20ccc1a5db
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -3759,10 +3759,11 @@ VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, siz
    if (fast_math_flag != 0)
    {
        std::vector<uint32_t> buffer;
        inject_fast_math(spv_data_modified, spv_data_size_modified, buffer,fast_math_flag);
        inject_fast_math(spv_data_modified, spv_data_size_modified, buffer, fast_math_flag);

        shader_module = compile_shader_module(buffer.data(), buffer.size() * sizeof(uint32_t));
    } else
    }
    else
    {
        shader_module = compile_shader_module(spv_data_modified, spv_data_size_modified);
    }
--- a/src/option.h
+++ b/src/option.h
@@ -61,14 +61,14 @@ public:
    {
        // Base
        VK_FAST_MATH_FLAG_DISABLE = 0x0,
        VK_FAST_MATH_FLAG_NotNaN = 0x1, // Assume parameters and result are not NaN. If this assumption does not hold then the operation returns an undefined value.
        VK_FAST_MATH_FLAG_NotInf = 0x2, // Assume parameters and result are not +/- Inf. If this assumption does not hold then the operation returns an undefined value.
        VK_FAST_MATH_FLAG_NSZ = 0x4, // Treat the sign of a zero parameter or result as insignificant.
        VK_FAST_MATH_FLAG_NotNaN = 0x1,     // Assume parameters and result are not NaN. If this assumption does not hold then the operation returns an undefined value.
        VK_FAST_MATH_FLAG_NotInf = 0x2,     // Assume parameters and result are not +/- Inf. If this assumption does not hold then the operation returns an undefined value.
        VK_FAST_MATH_FLAG_NSZ = 0x4,        // Treat the sign of a zero parameter or result as insignificant.
        VK_FAST_MATH_FLAG_AllowRecip = 0x8, // Allow the usage of reciprocal rather than perform a division.
        VK_FAST_MATH_FLAG_Fast = 0x10, // Allow algebraic transformations according to real-number associative and distributive algebra. This flag implies above;
        VK_FAST_MATH_FLAG_Fast = 0x10,      // Allow algebraic transformations according to real-number associative and distributive algebra. This flag implies all of the flags above (NotNaN, NotInf, NSZ, AllowRecip).
        // FloatControls2
        VK_FAST_MATH_FLAG_AllowContract = 0x10000, // Allows a floating-point operation to be contracted with any operation(s) producing its operands. Rounding steps may be eliminated or may preserve higher bit-depth than the specified types. The instructions producing the operands do not need to be decorated to allow this transformation.
        VK_FAST_MATH_FLAG_AllowReassoc = 0x20000, // Allows a floating-point operation to be reordered with any operation(s) producing its operands according to real-number associativity rules. The instructions producing the operands do not need to be decorated to allow this transformation.
        VK_FAST_MATH_FLAG_AllowContract = 0x10000,  // Allows a floating-point operation to be contracted with any operation(s) producing its operands. Rounding steps may be eliminated or may preserve higher bit-depth than the specified types. The instructions producing the operands do not need to be decorated to allow this transformation.
        VK_FAST_MATH_FLAG_AllowReassoc = 0x20000,   // Allows a floating-point operation to be reordered with any operation(s) producing its operands according to real-number associativity rules. The instructions producing the operands do not need to be decorated to allow this transformation.
        VK_FAST_MATH_FLAG_AllowTransform = 0x40000, // Allows a floating-point operation to be transformed with any operation(s) producing its operands according to real-number rules. This is a superset of AllowContract and AllowReassoc and those bits must be set whenever this bit is set. The instructions producing the operands do not need to be decorated to allow this transformation, but note that non-trivial transformations may require multiple instructions to be decorated.
    };

--- a/src/pipeline.cpp
+++ b/src/pipeline.cpp
@@ -223,7 +223,7 @@ int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const std::
    // get from pipeline cache
    return pipeline_cache->get_pipeline(spv_data, spv_data_size, specializations, d->local_size_x, d->local_size_y, d->local_size_z, d->subgroup_size,
                                        &d->shader_module, &d->descriptorset_layout, &d->pipeline_layout, &d->pipeline, &d->descriptor_update_template,
                                        d->shader_info,fast_math_flag);
                                        d->shader_info, fast_math_flag);
 }

 int Pipeline::create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations)
--- a/src/pipelinecache.cpp
+++ b/src/pipelinecache.cpp
@@ -119,7 +119,7 @@ public:
 };

 PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations,
                                                                   uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size, uint32_t _fast_math_flag)
        uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size, uint32_t _fast_math_flag)
 {
    spv_data_murmur3 = murmur3_32(spv_data, spv_data_size / 4);

@@ -139,7 +139,7 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_
 }

 PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(int _shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations,
                                                                   uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
        uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
 {
    shader_type_index = _shader_type_index;

--- a/tests/test_fast_math.cpp
+++ b/tests/test_fast_math.cpp
@@ -108,7 +108,6 @@ static int test_vulkan_fast_math()
    net_fast_math.load_model(dr);
    printf("Fast math net loaded successfully.\n");


    // ==================================================
    // 3. Warm-up Run
    // ==================================================
@@ -128,7 +127,6 @@ static int test_vulkan_fast_math()
    }
    printf("Warm-up complete.\n");


    // ==================================================
    // 4. Benchmark Performance
    // ==================================================
@@ -208,7 +206,6 @@ int main(int argc, char** argv)
        device_index = atoi(argv[1]);
    }


    int gpu_count = ncnn::get_gpu_count();
    if (device_index < 0 || device_index >= gpu_count)
    {