diff --git a/src/gpu.cpp b/src/gpu.cpp
index b9e60e98a..440a596cd 100644
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -520,11 +520,12 @@ int create_gpu_instance()
 //         fprintf(stderr, "[%u] deviceName = %s\n", i, physicalDeviceProperties.deviceName);
 //         fprintf(stderr, "[%u] pipelineCacheUUID = %u\n", i, physicalDeviceProperties.pipelineCacheUUID);
 
+        gpu_info.bug_local_size_spec_const = false;
+
         if (physicalDeviceProperties.vendorID == 0x13b5 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 0, 66))
         {
-            // ignore arm mali with old buggy driver
-            fprintf(stderr, "arm mali driver is too old\n");
-            continue;
+            // arm mali with old buggy driver
+            gpu_info.bug_local_size_spec_const = true;
         }
 
         if (physicalDeviceProperties.vendorID == 0x5143 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 0, 49))
@@ -762,9 +763,10 @@ int create_gpu_instance()
             gpu_info.support_fp16_storage = false;
         }
 
-        fprintf(stderr, "[%u %s]  queueC=%u[%u]  queueT=%u[%u]\n", i, physicalDeviceProperties.deviceName,
+        fprintf(stderr, "[%u %s]  queueC=%u[%u]  queueT=%u[%u]  buglssc=%d\n", i, physicalDeviceProperties.deviceName,
                 gpu_info.compute_queue_family_index, gpu_info.compute_queue_count,
-                gpu_info.transfer_queue_family_index, gpu_info.transfer_queue_count);
+                gpu_info.transfer_queue_family_index, gpu_info.transfer_queue_count,
+                gpu_info.bug_local_size_spec_const);
 
         fprintf(stderr, "[%u %s]  fp16p=%d  fp16s=%d  fp16a=%d  int8s=%d  int8a=%d\n", i, physicalDeviceProperties.deviceName,
                 gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic,
@@ -1049,6 +1051,30 @@ VkShaderModule VulkanDevice::get_shader_module(const char* name) const
     return 0;
 }
 
+// Compile the registered shader called name with its workgroup size fixed to
+// local_size_x/y/z. The first registry entry whose name matches is used.
+// Prints a message to stderr and returns 0 when no entry with SPIR-V data matches.
+VkShaderModule VulkanDevice::create_shader_module(const char* name, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const
+{
+    // index of the first matching registry entry, -1 while none is found
+    int found = -1;
+    for (int i=0; i<layer_shader_registry_entry_count && found < 0; i++)
+    {
+        if (strcmp(layer_shader_registry[i].name, name) == 0)
+            found = i;
+    }
+
+    const uint32_t* spv_data = found < 0 ? 0 : layer_shader_registry[found].spv_data;
+    if (!spv_data)
+    {
+        fprintf(stderr, "no such shader module %s\n", name);
+        return 0;
+    }
+
+    const size_t spv_data_size = layer_shader_registry[found].spv_data_size;
+    return compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z);
+}
+
 VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const
 {
     VkShaderModuleCreateInfo shaderModuleCreateInfo;
@@ -1069,6 +1097,144 @@ VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, siz
     return shader_module;
 }
 
+// Copy the SPIR-V module code into dstcode with the compute workgroup size
+// hard-wired to local_size_x/y/z instead of driven by specialization constants.
+//
+//   code, size   input module and its length in bytes
+//   dstcode      output buffer of at least size bytes, the output never grows
+//   dstsize      receives the number of bytes written to dstcode
+//
+// While copying, the operands of OpExecutionMode LocalSize are overwritten, the
+// SpecId 233/234/235 decorations and the OpSpecConstant they name are dropped,
+// and the WorkgroupSize builtin decoration and its OpSpecConstantComposite are
+// dropped. Every other instruction is copied unchanged.
+//
+// Malformed input is passed through rather than trusted: a module shorter than
+// the 5-word header is copied verbatim, and a zero-length or truncated
+// instruction ends the rewrite with the remaining words copied verbatim, so the
+// loop always terminates and never reads past code + size.
+static void inject_local_size_xyz(const uint32_t* code, size_t size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t* dstcode, size_t* dstsize)
+{
+    // ids stay at -1 until the OpDecorate that names them has been seen
+    uint32_t local_size_x_id = (uint32_t)-1;
+    uint32_t local_size_y_id = (uint32_t)-1;
+    uint32_t local_size_z_id = (uint32_t)-1;
+    uint32_t gl_WorkGroupSize_id = (uint32_t)-1;
+
+    // magic version generator bound schema
+    const size_t header_words = 5;
+    const size_t total_words = size / sizeof(uint32_t);
+
+    if (total_words < header_words)
+    {
+        if (size > 0)
+            memcpy(dstcode, code, size);
+        *dstsize = size;
+        return;
+    }
+
+    const uint32_t* p = code;
+    const uint32_t* const end = code + total_words;
+    uint32_t* dp = dstcode;
+
+    memcpy(dp, p, header_words * sizeof(uint32_t));
+    p += header_words;
+    dp += header_words;
+
+    // foreach op
+    while (p < end)
+    {
+        const size_t remaining = end - p;
+        const uint32_t wordcount = p[0] >> 16;
+        const uint32_t op = p[0] & 0xffff;
+
+        if (wordcount == 0 || wordcount > remaining)
+        {
+            // zero-length or truncated instruction, stop rewriting here
+            memcpy(dp, p, remaining * sizeof(uint32_t));
+            dp += remaining;
+            break;
+        }
+
+        bool drop = false;
+        bool set_local_size = false;
+
+        if (op == 16 && wordcount >= 6 && p[2] == 17) // OpExecutionMode LocalSize
+        {
+            set_local_size = true;
+        }
+        else if (op == 50 && wordcount >= 3) // OpSpecConstant
+        {
+            const uint32_t id = p[2];
+            drop = id == local_size_x_id || id == local_size_y_id || id == local_size_z_id;
+        }
+        else if (op == 51 && wordcount == 6) // OpSpecConstantComposite
+        {
+            // dropped when it is gl_WorkGroupSize and any constituent is a local size constant
+            drop = p[2] == gl_WorkGroupSize_id && (p[3] == local_size_x_id || p[4] == local_size_y_id || p[5] == local_size_z_id);
+        }
+        else if (op == 71 && wordcount >= 4) // OpDecorate
+        {
+            const uint32_t id = p[1];
+            const uint32_t decoration = p[2];
+            const uint32_t literal = p[3];
+
+            if (decoration == 1) // SpecId
+            {
+                if (literal == 233) local_size_x_id = id;
+                if (literal == 234) local_size_y_id = id;
+                if (literal == 235) local_size_z_id = id;
+                drop = literal == 233 || literal == 234 || literal == 235;
+            }
+            else if (decoration == 11 && literal == 25) // BuiltIn WorkgroupSize
+            {
+                gl_WorkGroupSize_id = id;
+                drop = true;
+            }
+        }
+
+        if (!drop)
+        {
+            memcpy(dp, p, wordcount * sizeof(uint32_t));
+            if (set_local_size)
+            {
+                dp[3] = local_size_x;
+                dp[4] = local_size_y;
+                dp[5] = local_size_z;
+            }
+            dp += wordcount;
+        }
+
+        p += wordcount;
+    }
+
+    *dstsize = (size_t)(dp - dstcode) * sizeof(uint32_t);
+}
+
+// Compile spv_data with its workgroup size fixed to local_size_x/y/z.
+// The module is patched into a temporary copy by inject_local_size_xyz, handed
+// to the two-argument compile_shader_module and the copy is freed afterwards.
+// Returns 0 when the temporary copy cannot be allocated.
+VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const
+{
+    // the patched module is never larger than the input
+    uint32_t* spv_data_modified = (uint32_t*)malloc(spv_data_size);
+    if (!spv_data_modified)
+    {
+        fprintf(stderr, "compile_shader_module malloc failed\n");
+        return 0;
+    }
+
+    size_t spv_data_size_modified = spv_data_size;
+    inject_local_size_xyz(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z, spv_data_modified, &spv_data_size_modified);
+
+    VkShaderModule shader_module = compile_shader_module(spv_data_modified, spv_data_size_modified);
+
+    free(spv_data_modified);
+
+    return shader_module;
+}
+
 uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const
 {
     // first try, find required and with preferred and without preferred_not
@@ -1293,6 +1431,12 @@ static inline bool string_ends_with_fp16a(const char* name)
 
 int VulkanDevice::create_shader_module()
 {
+    if (info.bug_local_size_spec_const)
+    {
+        // do not cache shader module
+        return 0;
+    }
+
     shader_modules.resize(layer_shader_registry_entry_count, VK_NULL_HANDLE);
 
     for (int i=0; i<layer_shader_registry_entry_count; i++)
diff --git a/src/gpu.h b/src/gpu.h
index 8b5d88386..3694b5e52 100644
--- a/src/gpu.h
+++ b/src/gpu.h
@@ -110,6 +110,9 @@ public:
     uint32_t graphics_queue_count;
     uint32_t transfer_queue_count;
 
+    // driver bug flag, not a feature: when set, the workgroup size is written into the SPIR-V code instead of being passed as specialization constants
+    bool bug_local_size_spec_const;
+
     // fp16 and int8 feature
     bool support_fp16_packed;
     bool support_fp16_storage;
@@ -156,8 +159,14 @@ public:
 
     VkShaderModule get_shader_module(const char* name) const;
 
+    // compile the registered shader called name with the workgroup size fixed to local_size_x/y/z, returns 0 when name is not registered
+    VkShaderModule create_shader_module(const char* name, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
+
     VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;
 
+    // compile spv_data (spv_data_size bytes) with the workgroup size fixed to local_size_x/y/z
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
+
     uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
     bool is_mappable(uint32_t memory_type_index) const;
 
diff --git a/src/pipeline.cpp b/src/pipeline.cpp
index 81d4a2251..de5a3dde0 100644
--- a/src/pipeline.cpp
+++ b/src/pipeline.cpp
@@ -48,7 +48,14 @@ Pipeline::~Pipeline()
 
 int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const char* entry_name, const std::vector<vk_specialization_type>& specializations, int binding_count, int push_constant_count)
 {
-    local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size);
+    if (vkdev->info.bug_local_size_spec_const)
+    {
+        local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z);
+    }
+    else
+    {
+        local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size);
+    }
 
 //     fprintf(stderr, "local_shader_module %p %s created\n", local_shader_module, entry_name);
 
@@ -88,6 +95,13 @@ int Pipeline::create(const char* _name, const Option& opt, const std::vector<vk_
         name += "_fp16p";
     }
 
+    if (vkdev->info.bug_local_size_spec_const)
+    {
+        local_shader_module = vkdev->create_shader_module(name.c_str(), local_size_x, local_size_y, local_size_z);
+
+        return create(local_shader_module, name.c_str(), specializations, binding_count, push_constant_count);
+    }
+
     VkShaderModule shader_module = vkdev->get_shader_module(name.c_str());
 
     return create(shader_module, name.c_str(), specializations, binding_count, push_constant_count);
@@ -324,6 +338,7 @@ int Pipeline::create_pipeline(VkShaderModule shader_module, const char* entry_na
     std::vector<vk_specialization_type> specialization_data = specializations;
 
     // append local_size_xyz specialization
+    if (!vkdev->info.bug_local_size_spec_const)
     {
         VkSpecializationMapEntry* local_size_xyz_entries = specializationMapEntries.data() + specialization_count;