diff --git a/src/gpu.cpp b/src/gpu.cpp
index 6dbed566c..6f65e5b3b 100644
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -503,7 +503,7 @@ int create_gpu_instance()
         gpu_info.host_visible_memory_index = find_host_visible_memory(physicalDeviceMemoryProperties);
 
         // treat as unified memory architecture if memory heap is the same
-        if (gpu_info.unified_memory_index != -1)
+        if (gpu_info.unified_memory_index != (uint32_t)-1)
         {
             int unified_memory_heap_index = physicalDeviceMemoryProperties.memoryTypes[gpu_info.unified_memory_index].heapIndex;
             int device_local_memory_heap_index = physicalDeviceMemoryProperties.memoryTypes[gpu_info.device_local_memory_index].heapIndex;
diff --git a/src/layer/flatten.cpp b/src/layer/flatten.cpp
index 17baaad3b..f3f40ccb5 100644
--- a/src/layer/flatten.cpp
+++ b/src/layer/flatten.cpp
@@ -25,6 +25,7 @@ Flatten::Flatten()
     support_vulkan = true;
 
 #if NCNN_VULKAN
+    pipeline_flatten = 0;
     pipeline_flatten_pack4 = 0;
 #endif // NCNN_VULKAN
 }
@@ -61,6 +62,13 @@ int Flatten::create_pipeline()
 {
     std::vector<vk_specialization_type> specializations;
 
+    // pack1
+    {
+        pipeline_flatten = new Pipeline(vkdev);
+        pipeline_flatten->set_optimal_local_size_xyz();
+        pipeline_flatten->create("flatten", specializations, 2, 10);
+    }
+
     // pack4
     {
         pipeline_flatten_pack4 = new Pipeline(vkdev);
@@ -73,6 +81,9 @@ int Flatten::create_pipeline()
 
 int Flatten::destroy_pipeline()
 {
+    delete pipeline_flatten;
+    pipeline_flatten = 0;
+
     delete pipeline_flatten_pack4;
     pipeline_flatten_pack4 = 0;
 
@@ -89,73 +100,55 @@ int Flatten::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd,
         return 0;
     }
 
-    if (dims == 2)
-    {
-        top_blob = bottom_blob;
-        top_blob.dims = 1;
-        top_blob.w = bottom_blob.w * bottom_blob.h;
-        top_blob.h = 1;
-        return 0;
-    }
-
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
     size_t elemsize = bottom_blob.elemsize;
     int packing = bottom_blob.packing;
 
-    int out_packing = (w * h * channels * packing) % 4 == 0 ? 4 : 1;
-    size_t out_elemsize = elemsize / packing * out_packing;
+    int total = w * h * channels * packing;
 
-    top_blob.create(w * h * channels * packing / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
-    if (top_blob.empty())
-        return -100;
+    int out_packing = total % 4 == 0 ? 4 : 1;
+    size_t out_elemsize = elemsize / packing * out_packing;
 
-    if (packing == 4 && out_packing == 4)
+    if (dims == 2 && packing == 1)
     {
-        std::vector<VkMat> bindings(2);
-        bindings[0] = bottom_blob;
-        bindings[1] = top_blob;
-
-        std::vector<vk_constant_type> constants(10);
-        constants[0].i = bottom_blob.dims;
-        constants[1].i = bottom_blob.w;
-        constants[2].i = bottom_blob.h;
-        constants[3].i = bottom_blob.c;
-        constants[4].i = bottom_blob.cstep;
-        constants[5].i = top_blob.dims;
-        constants[6].i = top_blob.w;
-        constants[7].i = top_blob.h;
-        constants[8].i = top_blob.c;
-        constants[9].i = top_blob.cstep;
-
-        // record
-        cmd.record_prepare_compute_barrier(bottom_blob);
-        cmd.record_prepare_compute_barrier(top_blob);
-        cmd.record_pipeline(pipeline_flatten_pack4, bindings, constants, top_blob);
-
+        top_blob = bottom_blob;
+        top_blob.dims = 1;
+        top_blob.w = total / out_packing;
+        top_blob.h = 1;
+        top_blob.cstep = top_blob.w;
+        top_blob.elemsize = out_elemsize;
+        top_blob.packing = out_packing;
         return 0;
     }
 
-    std::vector<VkBufferCopy> regions(channels);
-
-    int srcOffset = 0;
-    int dstOffset = 0;
-    for (int q=0; q<channels; q++)
-    {
-        int size = w * h * elemsize;
-
-        regions[q].srcOffset = bottom_blob.buffer_offset() + srcOffset;
-        regions[q].dstOffset = top_blob.buffer_offset() + dstOffset;
-        regions[q].size = size;
-
-        srcOffset += bottom_blob.cstep * elemsize;
-        dstOffset += size;
-    }
+    top_blob.create(total / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
+    if (top_blob.empty())
+        return -100;
 
-    cmd.record_prepare_transfer_barrier(bottom_blob);
-    cmd.record_prepare_transfer_barrier(top_blob);
-    cmd.record_copy_regions(bottom_blob, top_blob, regions);
+    std::vector<VkMat> bindings(2);
+    bindings[0] = bottom_blob;
+    bindings[1] = top_blob;
+
+    std::vector<vk_constant_type> constants(10);
+    constants[0].i = bottom_blob.dims;
+    constants[1].i = bottom_blob.w;
+    constants[2].i = bottom_blob.h;
+    constants[3].i = bottom_blob.c;
+    constants[4].i = bottom_blob.cstep;
+    constants[5].i = top_blob.dims;
+    constants[6].i = (packing == 1 && out_packing == 4) ? top_blob.w * out_packing : top_blob.w;
+    constants[7].i = top_blob.h;
+    constants[8].i = top_blob.c;
+    constants[9].i = top_blob.cstep;
+
+    const Pipeline* pipeline = packing == 4 ? pipeline_flatten_pack4 : pipeline_flatten;
+
+    // record
+    cmd.record_prepare_compute_barrier(bottom_blob);
+    cmd.record_prepare_compute_barrier(top_blob);
+    cmd.record_pipeline(pipeline, bindings, constants, top_blob);
 
     return 0;
 }
diff --git a/src/layer/flatten.h b/src/layer/flatten.h
index 41f7dbb16..60ecadd1b 100644
--- a/src/layer/flatten.h
+++ b/src/layer/flatten.h
@@ -35,6 +35,7 @@ public:
 
 public:
 #if NCNN_VULKAN
+    Pipeline* pipeline_flatten;
     Pipeline* pipeline_flatten_pack4;
 #endif // NCNN_VULKAN
 };
diff --git a/src/layer/packing.cpp b/src/layer/packing.cpp
index 0b62544f9..ec5941325 100644
--- a/src/layer/packing.cpp
+++ b/src/layer/packing.cpp
@@ -76,6 +76,14 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c
 
     if (dims == 1)
     {
+        if (out_packing == 1)
+        {
+            top_blob = bottom_blob;
+            top_blob.w = w * packing;
+            top_blob.elemsize = elemsize / packing;
+            return 0;
+        }
+
         int outw = (w * packing + out_packing - 1) / out_packing;
         size_t out_elemsize = elemsize / packing * out_packing;
 
diff --git a/src/layer/shader/flatten.comp b/src/layer/shader/flatten.comp
new file mode 100644
index 000000000..b35c1b95a
--- /dev/null
+++ b/src/layer/shader/flatten.comp
@@ -0,0 +1,57 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; }; // input blob, read as scalar floats
+layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; }; // output blob, written as scalar floats
+// push constants; field order mirrors constants[0..9] filled in Flatten::forward (presumably mapped in order - confirm)
+layout (push_constant) uniform parameter
+{
+    int dims; // bottom_blob.dims (not read by this shader)
+    int w; // bottom_blob.w
+    int h; // bottom_blob.h
+    int c; // bottom_blob.c (not read by this shader)
+    int cstep; // bottom_blob.cstep, multiplied by the plane index in main()
+
+    int outdims; // top_blob.dims (not read by this shader)
+    int outw; // count of output scalars to write; host passes top_blob.w, or top_blob.w * out_packing when packing 1 -> 4
+    int outh; // top_blob.h (not read by this shader)
+    int outc; // top_blob.c (not read by this shader)
+    int outcstep; // top_blob.cstep (not read by this shader)
+} p;
+
+void main() // each invocation copies one scalar float from the input to flat output index gx
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+    // only the x axis addresses output; NOTE(review): confirm the host dispatch covers outw invocations when outw is scaled by out_packing
+    if (gx >= p.outw || gy >= 1 || gz >= 1)
+        return;
+    // element count of one w*h plane of the input
+    int size = p.w * p.h;
+    // decompose the flat output index into plane z, row y and column x
+    int z = gx / size;
+    int y = gx % size / p.w;
+    int x = gx % size % p.w;
+    // input planes are addressed cstep elements apart, rows w elements apart
+    int v_offset = z * p.cstep + y * p.w + x;
+    // gather from the strided input position, store densely at gx
+    top_blob_data[gx] = bottom_blob_data[v_offset];
+}
diff --git a/src/layer/shader/flatten_pack4.comp b/src/layer/shader/flatten_pack4.comp
index 114c78da2..216b63057 100644
--- a/src/layer/shader/flatten_pack4.comp
+++ b/src/layer/shader/flatten_pack4.comp
@@ -45,20 +45,32 @@ void main()
     if (gx >= p.outw || gy >= 1 || gz >= 1)
         return;
 
-    const int size = p.w * p.h;
+    ivec4 v_offset;
 
-    const int z = (gx / size) * 4;
-    const int i = (gx % size) * 4;
+    if (p.dims == 2)
+    {
+        int y = gx / p.w;
+        ivec4 i4 = (gx % p.w) * 4 + ivec4(0, 1, 2, 3);
 
-    ivec4 i4 = ivec4(i, i+1, i+2, i+3);
+        ivec4 k4 = i4 / p.w;
+        ivec4 si4 = i4 % p.w;
 
-    ivec4 k4 = i4 / size;
-    ivec4 si4 = i4 % size;
+        v_offset = ivec4(y * 4 * p.w) + si4 * 4 + k4;
+    }
+    else // if (p.dims == 3)
+    {
+        int size = p.w * p.h;
 
-    ivec4 v_offset = ivec4(z * p.cstep) + si4 * 4 + k4;
+        int z = gx / size;
+        ivec4 i4 = (gx % size) * 4 + ivec4(0, 1, 2, 3);
 
-    vec4 v;
+        ivec4 k4 = i4 / size;
+        ivec4 si4 = i4 % size;
+
+        v_offset = ivec4(z * 4 * p.cstep) + si4 * 4 + k4;
+    }
 
+    vec4 v;
     v.r = bottom_blob_data[v_offset.r];
     v.g = bottom_blob_data[v_offset.g];
     v.b = bottom_blob_data[v_offset.b];
diff --git a/src/mat.h b/src/mat.h
index f9b2e2188..a820d1cfd 100644
--- a/src/mat.h
+++ b/src/mat.h
@@ -1412,6 +1412,9 @@ inline void VkMat::prepare_staging_buffer()
 
 inline void VkMat::discard_staging_buffer()
 {
+    if (allocator->mappable)
+        return;
+
     if (staging_refcount && NCNN_XADD(staging_refcount, -1) == 1)
     {
         if (staging_allocator && staging_data)