rough vulkan gemm and multiheadattention (#4618)

3 years ago · b640574b88
--- a/docs/developer-guide/operators.md
+++ b/docs/developer-guide/operators.md
@@ -1185,7 +1185,7 @@ y = affine(out)
 | param id  | name          | type  | default   | description       |
 | --------- | ------------- | ----- | --------- | ----------------- |
 | 0         | embed_dim     | int   | 0         |                   |
 | 1         | num_head      | int   | 1         |                   |
 | 1         | num_heads     | int   | 1         |                   |
 | 2         | weight_data_size| int | 0         |                   |
 | 3         | kdim          | int   | embed_dim |                   |
 | 4         | vdim          | int   | embed_dim |                   |
--- a/src/layer/arm/multiheadattention_arm.cpp
+++ b/src/layer/arm/multiheadattention_arm.cpp
@@ -68,7 +68,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt)
        Option optopt = optn;

        {
            const int embed_dim_per_head = embed_dim / num_head;
            const int embed_dim_per_head = embed_dim / num_heads;
            const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head);

            q_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
@@ -240,7 +240,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt)
    optopt.use_fp16_storage = false;

    {
        const int embed_dim_per_head = embed_dim / num_head;
        const int embed_dim_per_head = embed_dim / num_heads;
        const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head);

        q_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
@@ -530,7 +530,7 @@ int MultiHeadAttention_arm::forward(const std::vector<Mat>& bottom_blobs, std::v
    const Mat& k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1];
    const Mat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs.size() == 2 ? k_blob : bottom_blobs[2];

    const int embed_dim_per_head = embed_dim / num_head;
    const int embed_dim_per_head = embed_dim / num_heads;
    const int src_seqlen = q_blob.h * q_blob.elempack;
    const int dst_seqlen = k_blob.h * k_blob.elempack;

@@ -555,9 +555,9 @@ int MultiHeadAttention_arm::forward(const std::vector<Mat>& bottom_blobs, std::v
        Mat k_affine;
        k_gemm->forward(k_blob, k_affine, optn);

        Mat qk_cross(dst_seqlen, src_seqlen * num_head, 2u, optn.blob_allocator);
        Mat qk_cross(dst_seqlen, src_seqlen * num_heads, 2u, optn.blob_allocator);
        #pragma omp parallel for num_threads(optn.num_threads)
        for (int i = 0; i < num_head; i++)
        for (int i = 0; i < num_heads; i++)
        {
            std::vector<Mat> qk_bottom_blobs(2);
            qk_bottom_blobs[0] = q_affine.row_range(i * embed_dim_per_head, embed_dim_per_head);
@@ -577,9 +577,9 @@ int MultiHeadAttention_arm::forward(const std::vector<Mat>& bottom_blobs, std::v
        Mat v_affine;
        v_gemm->forward(v_blob, v_affine, optn);

        Mat qkv_cross(src_seqlen, embed_dim_per_head * num_head, 2u, optn.blob_allocator);
        Mat qkv_cross(src_seqlen, embed_dim_per_head * num_heads, 2u, optn.blob_allocator);
        #pragma omp parallel for num_threads(optn.num_threads)
        for (int i = 0; i < num_head; i++)
        for (int i = 0; i < num_heads; i++)
        {
            std::vector<Mat> qkv_bottom_blobs(2);
            qkv_bottom_blobs[0] = qk_cross.row_range(i * src_seqlen, src_seqlen);
@@ -605,9 +605,9 @@ int MultiHeadAttention_arm::forward(const std::vector<Mat>& bottom_blobs, std::v
    Mat k_affine;
    k_gemm->forward(k_blob, k_affine, opt32);

    Mat qk_cross(dst_seqlen, src_seqlen * num_head, 4u, opt32.blob_allocator);
    Mat qk_cross(dst_seqlen, src_seqlen * num_heads, 4u, opt32.blob_allocator);
    #pragma omp parallel for num_threads(opt32.num_threads)
    for (int i = 0; i < num_head; i++)
    for (int i = 0; i < num_heads; i++)
    {
        std::vector<Mat> qk_bottom_blobs(2);
        qk_bottom_blobs[0] = q_affine.row_range(i * embed_dim_per_head, embed_dim_per_head);
@@ -627,9 +627,9 @@ int MultiHeadAttention_arm::forward(const std::vector<Mat>& bottom_blobs, std::v
    Mat v_affine;
    v_gemm->forward(v_blob, v_affine, opt32);

    Mat qkv_cross(src_seqlen, embed_dim_per_head * num_head, 4u, opt32.blob_allocator);
    Mat qkv_cross(src_seqlen, embed_dim_per_head * num_heads, 4u, opt32.blob_allocator);
    #pragma omp parallel for num_threads(opt32.num_threads)
    for (int i = 0; i < num_head; i++)
    for (int i = 0; i < num_heads; i++)
    {
        std::vector<Mat> qkv_bottom_blobs(2);
        qkv_bottom_blobs[0] = qk_cross.row_range(i * src_seqlen, src_seqlen);
--- a/src/layer/multiheadattention.cpp
+++ b/src/layer/multiheadattention.cpp
@@ -25,7 +25,7 @@ MultiHeadAttention::MultiHeadAttention()
 int MultiHeadAttention::load_param(const ParamDict& pd)
 {
    embed_dim = pd.get(0, 0);
    num_head = pd.get(1, 1);
    num_heads = pd.get(1, 1);
    weight_data_size = pd.get(2, 0);
    kdim = pd.get(3, embed_dim);
    vdim = pd.get(4, embed_dim);
@@ -79,7 +79,7 @@ int MultiHeadAttention::forward(const std::vector<Mat>& bottom_blobs, std::vecto

    const int src_seqlen = q_blob.h;
    const int dst_seqlen = k_blob.h;
    const int embed_dim_per_head = embed_dim / num_head;
    const int embed_dim_per_head = embed_dim / num_heads;

    // assert k_blob.h == v_blob.h

@@ -88,18 +88,18 @@ int MultiHeadAttention::forward(const std::vector<Mat>& bottom_blobs, std::vecto
    if (top_blob.empty())
        return -1;

    Mat xq(embed_dim_per_head, src_seqlen, num_head, 4u, opt.workspace_allocator);
    Mat xk(embed_dim_per_head, dst_seqlen, num_head, 4u, opt.workspace_allocator);
    Mat xv(dst_seqlen, embed_dim_per_head, num_head, 4u, opt.workspace_allocator);
    Mat xq(embed_dim_per_head, src_seqlen, num_heads, 4u, opt.workspace_allocator);
    Mat xk(embed_dim_per_head, dst_seqlen, num_heads, 4u, opt.workspace_allocator);
    Mat xv(dst_seqlen, embed_dim_per_head, num_heads, 4u, opt.workspace_allocator);

    Mat xqk(dst_seqlen, src_seqlen, num_head, 4u, opt.workspace_allocator);
    Mat xqk(dst_seqlen, src_seqlen, num_heads, 4u, opt.workspace_allocator);

    Mat xqkv(embed_dim_per_head, num_head, src_seqlen, 4u, opt.workspace_allocator);
    Mat xqkv(embed_dim_per_head, num_heads, src_seqlen, 4u, opt.workspace_allocator);

    const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < num_head; q++)
    for (int q = 0; q < num_heads; q++)
    {
        // xq = affine(q) * inv_sqrt_embed_dim_per_head
        {
@@ -233,7 +233,7 @@ int MultiHeadAttention::forward(const std::vector<Mat>& bottom_blobs, std::vecto
        // xqkv = xqk * xv
        // xqk (dst_seqlen, src_seqlen)
        // xv  (dst_seqlen, embed_dim_per_head)
        // out (embed_dim_per_head, num_head, src_seqlen)
        // out (embed_dim_per_head, num_heads, src_seqlen)
        {
            const Mat xqkm = xqk.channel(q);
            const Mat xvm = xv.channel(q);
--- a/src/layer/multiheadattention.h
+++ b/src/layer/multiheadattention.h
@@ -32,7 +32,7 @@ public:

 public:
    int embed_dim;
    int num_head;
    int num_heads;
    int weight_data_size;
    int kdim;
    int vdim;
--- a/src/layer/vulkan/gemm_vulkan.cpp
+++ b/src/layer/vulkan/gemm_vulkan.cpp
@@ -0,0 +1,425 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "gemm_vulkan.h"

 #include "layer_shader_type.h"

 namespace ncnn {

 // Marks the layer as Vulkan-capable (buffer and image storage) and starts
 // with no compute pipeline; the pipeline is built later in create_pipeline().
 Gemm_vulkan::Gemm_vulkan()
     : pipeline_gemm(0)
 {
    support_image_storage = true;
    support_vulkan = true;
 }

 int Gemm_vulkan::create_pipeline(const Option& opt)
 {
    // const Mat& shape = top_shapes.empty() ? Mat() : top_shapes[0];

    // int elempack = 1;
    // if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;

    // size_t elemsize;
    // if (opt.use_fp16_storage)
    // {
    //     elemsize = elempack * 2u;
    // }
    // else if (opt.use_fp16_packed)
    // {
    //     elemsize = elempack == 1 ? 4u : elempack * 2u;
    // }
    // else
    // {
    //     elemsize = elempack * 4u;
    // }

    // Mat shape_packed;
    // if (shape.dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);

    if (constantA)
    {
        A_data_packed = transA ? A_data.reshape(constantM, constantK) : A_data.reshape(constantK, constantM);
    }
    if (constantB)
    {
        B_data_packed = transB ? B_data.reshape(constantK, constantN) : B_data.reshape(constantN, constantK);
    }
    if (constantC)
    {
        C_data_packed = C_data;
    }

    std::vector<vk_specialization_type> specializations(15);
    specializations[0].f = alpha;
    specializations[1].f = beta;
    specializations[2].i = transA;
    specializations[3].i = transB;
    specializations[4].i = constantA;
    specializations[5].i = constantB;
    specializations[6].i = constantC;
    specializations[7].i = constantM;
    specializations[8].i = constantN;
    specializations[9].i = constantK;
    specializations[10].i = constant_broadcast_type_C;
    specializations[11].i = output_N1M;
    specializations[12].i = output_elempack;
    specializations[13].i = output_elemtype;
    specializations[14].i = output_transpose;

    Mat local_size_xyz;
    // if (shape_packed.dims == 2)
    // {
    //     local_size_xyz.w = std::min(8, shape_packed.w);
    //     local_size_xyz.h = std::min(8, shape_packed.h);
    //     local_size_xyz.c = 1;
    // }

    // pack1
    // if (shape.dims == 0 || elempack == 1)
    {
        pipeline_gemm = new Pipeline(vkdev);
        pipeline_gemm->set_optimal_local_size_xyz(local_size_xyz);
        if (opt.use_shader_local_memory)
        {
            pipeline_gemm->set_local_size_xyz(8, 8, 1);
        }
        pipeline_gemm->create(LayerShaderType::gemm, opt, specializations);
    }

    return 0;
 }

 int Gemm_vulkan::destroy_pipeline(const Option& /*opt*/)
 {
    delete pipeline_gemm;
    pipeline_gemm = 0;

    return 0;
 }

 // Records the upload of every constant operand (A, B, C) to the GPU and
 // releases the host-side staging Mat once its upload has been recorded.
 //
 // The destination is the image variant when both the layer and the option
 // enable image storage, otherwise the buffer variant. Always returns 0.
 int Gemm_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
 {
    const bool to_image = support_image_storage && opt.use_image_storage;

    if (constantA)
    {
        if (to_image)
            cmd.record_upload(A_data_packed, A_data_gpu_image, opt);
        else
            cmd.record_upload(A_data_packed, A_data_gpu, opt);

        A_data_packed.release();
    }

    if (constantB)
    {
        if (to_image)
            cmd.record_upload(B_data_packed, B_data_gpu_image, opt);
        else
            cmd.record_upload(B_data_packed, B_data_gpu, opt);

        B_data_packed.release();
    }

    if (constantC)
    {
        if (to_image)
            cmd.record_upload(C_data_packed, C_data_gpu_image, opt);
        else
            cmd.record_upload(C_data_packed, C_data_gpu, opt);

        C_data_packed.release();
    }

    return 0;
 }

 // Records a GEMM dispatch on buffer storage.
 //
 // Operand selection:
 //   A - A_data_gpu when constantA, otherwise bottom_blobs[0]
 //   B - B_data_gpu when constantB, otherwise bottom_blobs[0] if A is constant, else bottom_blobs[1]
 //   C - C_data_gpu when constantC, otherwise the last bottom blob
 // All three are repacked to elempack 1 before the dispatch. The output is
 // created unpacked and, if needed, repacked afterwards to output_elempack
 // (or to 8/4/1 chosen from the output height when output_elempack is 0).
 //
 // Returns 0 on success and -100 when the output blob cannot be allocated.
 int Gemm_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const
 {
    const VkMat& A0 = constantA ? A_data_gpu : bottom_blobs[0];
    const VkMat& B0 = constantB ? B_data_gpu : constantA ? bottom_blobs[0] : bottom_blobs[1];
    const VkMat& C0 = constantC ? C_data_gpu : bottom_blobs[bottom_blobs.size() - 1];

    VkMat A;
    VkMat B;
    VkMat C;
    vkdev->convert_packing(A0, A, 1, cmd, opt);
    vkdev->convert_packing(B0, B, 1, cmd, opt);
    vkdev->convert_packing(C0, C, 1, cmd, opt);

    // problem size: layer constants win, otherwise it is read off the operand shapes
    const int M = constantM ? constantM : transA ? A.w : (A.dims == 3 ? A.c : A.h);
    const int K = constantK ? constantK : transA ? (A.dims == 3 ? A.c : A.h) : A.w;
    const int N = constantN ? constantN : transB ? (B.dims == 3 ? B.c : B.h) : B.w;

    // Start from -1 so the push constant never carries an indeterminate value
    // when C matches none of the shapes below.
    // NOTE(review): -1 is assumed to mean "no C term" for the gemm shader - confirm.
    int broadcast_type_C = -1;
    if (constantC)
    {
        broadcast_type_C = constant_broadcast_type_C;
    }
    else
    {
        // the checks are deliberately independent: when several match, the last one wins
        if (C.dims == 1 && C.w == 1)
        {
            // scalar
            broadcast_type_C = 0;
        }
        if (C.dims == 1 && C.w == M)
        {
            // M
            // auto broadcast from h to w is the ncnn-style convention
            broadcast_type_C = 1;
        }
        if (C.dims == 1 && C.w == N)
        {
            // N
            broadcast_type_C = 4;
        }
        if (C.dims == 2 && C.w == 1 && C.h == M)
        {
            // Mx1
            broadcast_type_C = 2;
        }
        if (C.dims == 2 && C.w == N && C.h == M)
        {
            // MxN
            broadcast_type_C = 3;
        }
        if (C.dims == 2 && C.w == N && C.h == 1)
        {
            // 1xN
            broadcast_type_C = 4;
        }
    }

    // A was repacked to elempack 1 above, so this is the per-element size
    size_t elemsize = A.elemsize;

    VkMat& top_blob = top_blobs[0];
    if (output_transpose)
    {
        if (output_N1M)
            top_blob.create(M, 1, N, elemsize, opt.blob_vkallocator);
        else
            top_blob.create(M, N, elemsize, opt.blob_vkallocator);
    }
    else
    {
        if (output_N1M)
            top_blob.create(N, 1, M, elemsize, opt.blob_vkallocator);
        else
            top_blob.create(N, M, elemsize, opt.blob_vkallocator);
    }
    if (top_blob.empty())
        return -100;

    std::vector<VkMat> bindings(4);
    bindings[0] = top_blob;
    bindings[1] = A;
    bindings[2] = B;
    bindings[3] = C;

    // push constants: sizes, C broadcast mode, then dims and row/channel step of A, B and the output
    std::vector<vk_constant_type> constants(10);
    constants[0].i = M;
    constants[1].i = N;
    constants[2].i = K;
    constants[3].i = broadcast_type_C;
    constants[4].i = A.dims;
    constants[5].i = A.dims == 3 ? A.cstep : transA ? M : K;
    constants[6].i = B.dims;
    constants[7].i = B.dims == 3 ? B.cstep : transB ? K : N;
    constants[8].i = top_blob.dims;
    constants[9].i = top_blob.dims == 3 ? top_blob.cstep : top_blob.w;

    const Pipeline* pipeline = pipeline_gemm;

    // dispatch over ceil(N/2) x ceil(M/2)
    // NOTE(review): presumably each invocation produces a 2x2 output tile - confirm against the shader
    VkMat dispatcher;
    dispatcher.w = (N + 1) / 2;
    dispatcher.h = (M + 1) / 2;
    dispatcher.c = 1;
    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);

    // pick the output packing from the output height unless the layer forces one
    int out_elempack = 1;
    {
        int outh = output_transpose ? N : M;
        out_elempack = opt.use_shader_pack8 && outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1;
    }
    if (output_elempack)
        out_elempack = output_elempack;

    if (out_elempack != 1)
    {
        VkMat top_blob0;
        vkdev->convert_packing(top_blob, top_blob0, out_elempack, cmd, opt);
        top_blobs[0] = top_blob0;
    }

    return 0;
 }

 int Gemm_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
 {
    std::vector<VkMat> bottom_blobs(1);
    std::vector<VkMat> top_blobs(1);
    bottom_blobs[0] = bottom_blob;
    int ret = forward(bottom_blobs, top_blobs, cmd, opt);
    top_blob = top_blobs[0];
    return ret;
 }

 int Gemm_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const
 {
    const VkImageMat& A0 = constantA ? A_data_gpu_image : bottom_blobs[0];
    const VkImageMat& B0 = constantB ? B_data_gpu_image : constantA ? bottom_blobs[0] : bottom_blobs[1];
    const VkImageMat& C0 = constantC ? C_data_gpu_image : bottom_blobs[bottom_blobs.size() - 1];

    VkImageMat A;
    VkImageMat B;
    VkImageMat C;
    vkdev->convert_packing(A0, A, 1, cmd, opt);
    vkdev->convert_packing(B0, B, 1, cmd, opt);
    vkdev->convert_packing(C0, C, 1, cmd, opt);

    const int M = constantM ? constantM : transA ? A.w : (A.dims == 3 ? A.c : A.h);
    const int K = constantK ? constantK : transA ? (A.dims == 3 ? A.c : A.h) : A.w;
    const int N = constantN ? constantN : transB ? (B.dims == 3 ? B.c : B.h) : B.w;

    int broadcast_type_C;
    if (constantC)
    {
        broadcast_type_C = constant_broadcast_type_C;
    }
    else
    {
        if (C.dims == 1 && C.w == 1)
        {
            // scalar
            broadcast_type_C = 0;
        }
        if (C.dims == 1 && C.w == M)
        {
            // M
            // auto broadcast from h to w is the ncnn-style convention
            broadcast_type_C = 1;
        }
        if (C.dims == 1 && C.w == N)
        {
            // N
            broadcast_type_C = 4;
        }
        if (C.dims == 2 && C.w == 1 && C.h == M)
        {
            // Mx1
            broadcast_type_C = 2;
        }
        if (C.dims == 2 && C.w == N && C.h == M)
        {
            // MxN
            broadcast_type_C = 3;
        }
        if (C.dims == 2 && C.w == N && C.h == 1)
        {
            // 1xN
            broadcast_type_C = 4;
        }
    }

    int elempack = A.elempack;
    size_t elemsize = A.elemsize;

    VkImageMat& top_blob = top_blobs[0];
    if (output_transpose)
    {
        if (output_N1M)
            top_blob.create(M, 1, N, elemsize, opt.blob_vkallocator);
        else
            top_blob.create(M, N, elemsize, opt.blob_vkallocator);
    }
    else
    {
        if (output_N1M)
            top_blob.create(N, 1, M, elemsize, opt.blob_vkallocator);
        else
            top_blob.create(N, M, elemsize, opt.blob_vkallocator);
    }
    if (top_blob.empty())
        return -100;

    std::vector<VkImageMat> bindings(4);
    bindings[0] = top_blob;
    bindings[1] = A;
    bindings[2] = B;
    bindings[3] = C;

    std::vector<vk_constant_type> constants(10);
    constants[0].i = M;
    constants[1].i = N;
    constants[2].i = K;
    constants[3].i = broadcast_type_C;
    constants[4].i = A.dims;
    constants[5].i = 0; //A.w;
    constants[6].i = B.dims;
    constants[7].i = 0; //B.w;
    constants[8].i = top_blob.dims;
    constants[9].i = 0; //top_blob.w;

    const Pipeline* pipeline = pipeline_gemm;

    VkImageMat dispatcher;
    dispatcher.w = (N + 1) / 2;
    dispatcher.h = (M + 1) / 2;
    dispatcher.c = 1;
    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);

    int out_elempack = 1;
    {
        int outh = output_transpose ? N : M;
        out_elempack = opt.use_shader_pack8 && outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1;
    }
    if (output_elempack)
        out_elempack = output_elempack;

    if (out_elempack != 1)
    {
        VkImageMat top_blob0;
        vkdev->convert_packing(top_blob, top_blob0, out_elempack, cmd, opt);
        top_blobs[0] = top_blob0;
    }

    return 0;
 }

 int Gemm_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
 {
    std::vector<VkImageMat> bottom_blobs(1);
    std::vector<VkImageMat> top_blobs(1);
    bottom_blobs[0] = bottom_blob;
    int ret = forward(bottom_blobs, top_blobs, cmd, opt);
    top_blob = top_blobs[0];
    return ret;
 }

 } // namespace ncnn
--- a/src/layer/vulkan/gemm_vulkan.h
+++ b/src/layer/vulkan/gemm_vulkan.h
@@ -0,0 +1,56 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_GEMM_VULKAN_H
 #define LAYER_GEMM_VULKAN_H

 #include "gemm.h"

 namespace ncnn {

 // Vulkan implementation of the Gemm layer, with forward paths for both
 // buffer (VkMat) and image (VkImageMat) storage.
 class Gemm_vulkan : virtual public Gemm
 {
 public:
    Gemm_vulkan();

    // build / free the gemm compute pipeline
    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // record the upload of the constant A/B/C operands to the GPU
    virtual int upload_model(VkTransfer& cmd, const Option& opt);

    // keep the base-class forward overloads visible next to the Vulkan ones
    using Gemm::forward;
    // buffer-storage forward, multi-blob and single-blob
    virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
    // image-storage forward, multi-blob and single-blob
    virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
    virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;

 public:
    // host-side staging copies of the constant operands; filled in
    // create_pipeline and released by upload_model once the upload is recorded
    Mat A_data_packed;
    Mat B_data_packed;
    Mat C_data_packed;

    // constant operands on the GPU, buffer storage
    VkMat A_data_gpu;
    VkMat B_data_gpu;
    VkMat C_data_gpu;

    // constant operands on the GPU, image storage
    VkImageMat A_data_gpu_image;
    VkImageMat B_data_gpu_image;
    VkImageMat C_data_gpu_image;

    // compute pipeline for the gemm shader; owned, null until create_pipeline
    Pipeline* pipeline_gemm;
 };

 } // namespace ncnn

 #endif // LAYER_GEMM_VULKAN_H
--- a/src/layer/vulkan/multiheadattention_vulkan.cpp
+++ b/src/layer/vulkan/multiheadattention_vulkan.cpp
@@ -0,0 +1,705 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "multiheadattention_vulkan.h"

 #include "layer_shader_type.h"
 #include "layer_type.h"

 namespace ncnn {

 // Marks the layer as Vulkan-capable (buffer and image storage) and nulls
 // every child layer and pipeline pointer; they are built in create_pipeline().
 MultiHeadAttention_vulkan::MultiHeadAttention_vulkan()
 {
    support_image_storage = true;
    support_vulkan = true;

    // child layers
    q_gemm = k_gemm = v_gemm = o_gemm = 0;
    qk_softmax = 0;

    // q x k pipelines, one per packing combination
    pipeline_multiheadattention_qk_cross_pack4to1 = 0;
    pipeline_multiheadattention_qk_cross_pack1to4 = 0;
    pipeline_multiheadattention_qk_cross_pack4 = 0;
    pipeline_multiheadattention_qk_cross = 0;

    // qk x v pipelines, one per packing combination
    pipeline_multiheadattention_qkv_cross_pack4to1 = 0;
    pipeline_multiheadattention_qkv_cross_pack1to4 = 0;
    pipeline_multiheadattention_qkv_cross_pack4 = 0;
    pipeline_multiheadattention_qkv_cross = 0;
 }

 // Builds everything the Vulkan attention path needs:
 //   - q/k/v projection Gemm layers with the layer's weights and biases as constants
 //   - four qk_cross and four qkv_cross compute pipelines (pack1, pack4, pack1to4, pack4to1)
 //   - a Softmax layer applied to the q x k scores
 //   - the output projection Gemm layer
 //
 // Always returns 0.
 // NOTE(review): the return codes of the child load_param / load_model /
 // create_pipeline calls are not checked here.
 int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
 {
    const int embed_dim_per_head = embed_dim / num_heads;
    // q projection: constant weight (A) and bias (C), the input is fed as B
    {
        const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head);

        q_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
        q_gemm->vkdev = vkdev;
        ncnn::ParamDict pd;
        // ids 0 and 1 are presumably Gemm's alpha and beta - TODO confirm;
        // if so, the 1/sqrt(embed_dim_per_head) scaling is folded into this projection
        pd.set(0, inv_sqrt_embed_dim_per_head);
        pd.set(1, 1.f);
        pd.set(2, 0);         // transA
        pd.set(3, 1);         // transB
        pd.set(4, 1);         // constantA
        pd.set(5, 0);         // constantB
        pd.set(6, 1);         // constantC
        pd.set(7, embed_dim); // M
        pd.set(8, 0);         // N
        pd.set(9, embed_dim); // K
        pd.set(10, 1);        // constant_broadcast_type_C
        pd.set(11, 0);        // output_N1M
        // pd.set(12, 1);        // output_elempack
        pd.set(14, 0); // output_transpose
        q_gemm->load_param(pd);
        Mat weights[2];
        weights[0] = q_weight_data;
        weights[1] = q_bias_data;
        q_gemm->load_model(ModelBinFromMatArray(weights));
        q_gemm->create_pipeline(opt);
    }

    // k projection: same layout as q, but K = kdim and ids 0/1 are left at their defaults
    {
        k_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
        k_gemm->vkdev = vkdev;
        ncnn::ParamDict pd;
        pd.set(2, 0);         // transA
        pd.set(3, 1);         // transB
        pd.set(4, 1);         // constantA
        pd.set(5, 0);         // constantB
        pd.set(6, 1);         // constantC
        pd.set(7, embed_dim); // M
        pd.set(8, 0);         // N
        pd.set(9, kdim);      // K
        pd.set(10, 1);        // constant_broadcast_type_C
        pd.set(11, 0);        // output_N1M
        // pd.set(12, 1);        // output_elempack
        pd.set(14, 0); // output_transpose
        k_gemm->load_param(pd);
        Mat weights[2];
        weights[0] = k_weight_data;
        weights[1] = k_bias_data;
        k_gemm->load_model(ModelBinFromMatArray(weights));
        k_gemm->create_pipeline(opt);
    }

    // v projection: same layout as k, with K = vdim
    {
        v_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
        v_gemm->vkdev = vkdev;
        ncnn::ParamDict pd;
        pd.set(2, 0);         // transA
        pd.set(3, 1);         // transB
        pd.set(4, 1);         // constantA
        pd.set(5, 0);         // constantB
        pd.set(6, 1);         // constantC
        pd.set(7, embed_dim); // M
        pd.set(8, 0);         // N
        pd.set(9, vdim);      // K
        pd.set(10, 1);        // constant_broadcast_type_C
        pd.set(11, 0);        // output_N1M
        // pd.set(12, 1);        // output_elempack
        pd.set(14, 0); // output_transpose
        v_gemm->load_param(pd);
        Mat weights[2];
        weights[0] = v_weight_data;
        weights[1] = v_bias_data;
        v_gemm->load_model(ModelBinFromMatArray(weights));
        v_gemm->create_pipeline(opt);
    }

    // q x k pipelines: the M/N/K specializations are passed as 0 and only
    // num_heads is baked in; each variant uses a fixed 8x8x1 local size
    {
        std::vector<vk_specialization_type> specializations(4);
        specializations[0].i = 0; //constantM;
        specializations[1].i = 0; //constantN;
        specializations[2].i = 0; //embed_dim_per_head;//constantK;
        specializations[3].i = num_heads;

        {
            pipeline_multiheadattention_qk_cross = new Pipeline(vkdev);
            pipeline_multiheadattention_qk_cross->set_local_size_xyz(8, 8, 1);
            pipeline_multiheadattention_qk_cross->create(LayerShaderType::multiheadattention_qk_cross, opt, specializations);
        }
        {
            pipeline_multiheadattention_qk_cross_pack4 = new Pipeline(vkdev);
            pipeline_multiheadattention_qk_cross_pack4->set_local_size_xyz(8, 8, 1);
            pipeline_multiheadattention_qk_cross_pack4->create(LayerShaderType::multiheadattention_qk_cross_pack4, opt, specializations);
        }
        {
            pipeline_multiheadattention_qk_cross_pack1to4 = new Pipeline(vkdev);
            pipeline_multiheadattention_qk_cross_pack1to4->set_local_size_xyz(8, 8, 1);
            pipeline_multiheadattention_qk_cross_pack1to4->create(LayerShaderType::multiheadattention_qk_cross_pack1to4, opt, specializations);
        }
        {
            pipeline_multiheadattention_qk_cross_pack4to1 = new Pipeline(vkdev);
            pipeline_multiheadattention_qk_cross_pack4to1->set_local_size_xyz(8, 8, 1);
            pipeline_multiheadattention_qk_cross_pack4to1->create(LayerShaderType::multiheadattention_qk_cross_pack4to1, opt, specializations);
        }
    }
    // qk x v pipelines: same scheme as the q x k pipelines above
    {
        std::vector<vk_specialization_type> specializations(4);
        specializations[0].i = 0; //constantM;
        specializations[1].i = 0; //embed_dim_per_head;//constantN;
        specializations[2].i = 0; //constantK;
        specializations[3].i = num_heads;

        {
            pipeline_multiheadattention_qkv_cross = new Pipeline(vkdev);
            pipeline_multiheadattention_qkv_cross->set_local_size_xyz(8, 8, 1);
            pipeline_multiheadattention_qkv_cross->create(LayerShaderType::multiheadattention_qkv_cross, opt, specializations);
        }
        {
            pipeline_multiheadattention_qkv_cross_pack4 = new Pipeline(vkdev);
            pipeline_multiheadattention_qkv_cross_pack4->set_local_size_xyz(8, 8, 1);
            pipeline_multiheadattention_qkv_cross_pack4->create(LayerShaderType::multiheadattention_qkv_cross_pack4, opt, specializations);
        }
        {
            pipeline_multiheadattention_qkv_cross_pack1to4 = new Pipeline(vkdev);
            pipeline_multiheadattention_qkv_cross_pack1to4->set_local_size_xyz(8, 8, 1);
            pipeline_multiheadattention_qkv_cross_pack1to4->create(LayerShaderType::multiheadattention_qkv_cross_pack1to4, opt, specializations);
        }
        {
            pipeline_multiheadattention_qkv_cross_pack4to1 = new Pipeline(vkdev);
            pipeline_multiheadattention_qkv_cross_pack4to1->set_local_size_xyz(8, 8, 1);
            pipeline_multiheadattention_qkv_cross_pack4to1->create(LayerShaderType::multiheadattention_qkv_cross_pack4to1, opt, specializations);
        }
    }

    // softmax over the attention scores
    // NOTE(review): ids 0 and 1 are presumably Softmax's axis (-1) and fixbug0 (1) - confirm
    {
        qk_softmax = ncnn::create_layer(ncnn::LayerType::Softmax);
        qk_softmax->vkdev = vkdev;
        ncnn::ParamDict pd;
        pd.set(0, -1);
        pd.set(1, 1);
        qk_softmax->load_param(pd);
        qk_softmax->load_model(ModelBinFromMatArray(0));
        qk_softmax->create_pipeline(opt);
    }

    // output projection: here the weight is the constant B operand and the
    // attention result is fed as a transposed A
    {
        o_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
        o_gemm->vkdev = vkdev;
        ncnn::ParamDict pd;
        pd.set(2, 1);         // transA
        pd.set(3, 1);         // transB
        pd.set(4, 0);         // constantA
        pd.set(5, 1);         // constantB
        pd.set(6, 1);         // constantC
        pd.set(7, 0);         // M = outch
        pd.set(8, embed_dim); // N = size
        pd.set(9, embed_dim); // K = maxk*inch
        pd.set(10, 4);        // constant_broadcast_type_C
        pd.set(11, 0);        // output_N1M
        o_gemm->load_param(pd);
        Mat weights[2];
        weights[0] = out_weight_data;
        weights[1] = out_bias_data;
        o_gemm->load_model(ModelBinFromMatArray(weights));
        o_gemm->create_pipeline(opt);
    }

    return 0;
 }

 // Tears down one child layer, frees it and nulls the pointer.
 // Does nothing when the pointer is already null.
 template<typename LayerPtr>
 static void mha_vulkan_release_layer(LayerPtr& layer, const Option& opt)
 {
    if (!layer)
        return;

    layer->destroy_pipeline(opt);
    delete layer;
    layer = 0;
 }

 // Frees one compute pipeline and nulls the pointer (deleting null is a no-op).
 template<typename PipelinePtr>
 static void mha_vulkan_release_pipeline(PipelinePtr& pipeline)
 {
    delete pipeline;
    pipeline = 0;
 }

 // Releases every child layer and compute pipeline created by create_pipeline().
 // Always returns 0.
 int MultiHeadAttention_vulkan::destroy_pipeline(const Option& opt)
 {
    // input projections
    mha_vulkan_release_layer(q_gemm, opt);
    mha_vulkan_release_layer(k_gemm, opt);
    mha_vulkan_release_layer(v_gemm, opt);

    // q x k pipelines
    mha_vulkan_release_pipeline(pipeline_multiheadattention_qk_cross);
    mha_vulkan_release_pipeline(pipeline_multiheadattention_qk_cross_pack4);
    mha_vulkan_release_pipeline(pipeline_multiheadattention_qk_cross_pack1to4);
    mha_vulkan_release_pipeline(pipeline_multiheadattention_qk_cross_pack4to1);

    // qk x v pipelines
    mha_vulkan_release_pipeline(pipeline_multiheadattention_qkv_cross);
    mha_vulkan_release_pipeline(pipeline_multiheadattention_qkv_cross_pack4);
    mha_vulkan_release_pipeline(pipeline_multiheadattention_qkv_cross_pack1to4);
    mha_vulkan_release_pipeline(pipeline_multiheadattention_qkv_cross_pack4to1);

    // softmax and output projection
    mha_vulkan_release_layer(qk_softmax, opt);
    mha_vulkan_release_layer(o_gemm, opt);

    return 0;
 }

 // Forwards the weight upload to each projection Gemm layer that exists
 // (q, k, v, then the output projection). Always returns 0.
 int MultiHeadAttention_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
 {
    if (q_gemm != 0)
        q_gemm->upload_model(cmd, opt);

    if (k_gemm != 0)
        k_gemm->upload_model(cmd, opt);

    if (v_gemm != 0)
        v_gemm->upload_model(cmd, opt);

    if (o_gemm != 0)
        o_gemm->upload_model(cmd, opt);

    return 0;
 }

 // Buffer-storage (VkMat) forward pass.
 //
 // bottom_blobs  1 to 3 inputs: [q], [q, k] or [q, k, v]; a missing k reuses
 //               q and a missing v reuses k
 // top_blobs     the result is written to top_blobs[0]
 // cmd           compute command every stage is recorded into
 //
 // Stages:
 //   1. q_gemm and k_gemm are applied to q and k
 //   2. a qk_cross pipeline multiplies them per head (B = num_heads batches)
 //   3. qk_softmax runs in place on that product
 //   4. v_gemm is applied to v, a qkv_cross pipeline combines it with the
 //      softmax output
 //   5. o_gemm produces top_blobs[0]
 //
 // Returns 0 on success, -100 when an intermediate blob cannot be allocated,
 // or the first non-zero code returned by a sub-layer.
 int MultiHeadAttention_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const
 {
    const VkMat& q_blob = bottom_blobs[0];
    const VkMat& k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1];
    const VkMat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs.size() == 2 ? k_blob : bottom_blobs[2];

    // A failed sub-layer may leave its output blob empty. Stop right away in
    // that case: the code below divides by the blob's elempack and binds the
    // blob to a pipeline.
    int ret = 0;

    VkMat q_affine;
    ret = q_gemm->forward(q_blob, q_affine, cmd, opt);
    if (ret != 0)
        return ret;

    VkMat k_affine;
    ret = k_gemm->forward(k_blob, k_affine, cmd, opt);
    if (ret != 0)
        return ret;

    VkMat qk_cross;
    {
        // M and N are the widths of the two gemm outputs, K is the number of
        // (unpacked) rows each head owns, B is the head count
        int M = q_affine.w;
        int N = k_affine.w;
        int K = q_affine.h * q_affine.elempack / num_heads;
        int B = num_heads;

        // int K_elempack = opt.use_shader_pack8 && K % 8 == 0 ? 8 : K % 4 == 0 ? 4 : 1;
        // int M_elempack = opt.use_shader_pack8 && M % 8 == 0 ? 8 : M % 4 == 0 ? 4 : 1;
        // int MB_elempack = opt.use_shader_pack8 && (M * B) % 8 == 0 ? 8 : (M * B) % 4 == 0 ? 4 : 1;
        int K_elempack = K % 4 == 0 ? 4 : 1;
        int M_elempack = M % 4 == 0 ? 4 : 1;
        int MB_elempack = (M * B) % 4 == 0 ? 4 : 1;
        size_t M_elemsize = q_affine.elemsize / q_affine.elempack * M_elempack;

        // fp16 packed without fp16 storage: packed elements are 2 bytes per
        // lane, unpacked ones stay 4 bytes
        if (opt.use_fp16_packed && !opt.use_fp16_storage)
        {
            if (M_elempack == 8) M_elemsize = 8 * 2u;
            if (M_elempack == 4) M_elemsize = 4 * 2u;
            if (M_elempack == 1) M_elemsize = 4u;
        }

        // the inputs may be packed wider than a single head allows; unpack
        // them to the per-head packing first
        if (K_elempack < q_affine.elempack)
        {
            VkMat tmp;
            vkdev->convert_packing(q_affine, tmp, K_elempack, cmd, opt);
            q_affine = tmp;
        }
        if (K_elempack < k_affine.elempack)
        {
            VkMat tmp;
            vkdev->convert_packing(k_affine, tmp, K_elempack, cmd, opt);
            k_affine = tmp;
        }

        // w = N, h = all heads stacked (M / M_elempack packed rows per head)
        qk_cross.create(N, M / M_elempack * B, M_elemsize, M_elempack, opt.blob_vkallocator);
        if (qk_cross.empty())
            return -100;

        std::vector<VkMat> bindings(3);
        bindings[0] = q_affine;
        bindings[1] = k_affine;
        bindings[2] = qk_cross;

        std::vector<vk_constant_type> constants(4);
        constants[0].i = M / M_elempack;
        constants[1].i = N;
        constants[2].i = K / K_elempack;
        constants[3].i = B;

        // one invocation per output element: x = N, y = packed M, z = head
        VkMat dispatcher;
        dispatcher.w = N;
        dispatcher.h = M / M_elempack;
        dispatcher.c = B;

        // K_elempack is the input packing, M_elempack the output packing;
        // both are either 1 or 4, so exactly one branch matches
        const Pipeline* pipeline = 0;
        if (K_elempack == 1 && M_elempack == 1)
        {
            pipeline = pipeline_multiheadattention_qk_cross;
        }
        if (K_elempack == 1 && M_elempack == 4)
        {
            pipeline = pipeline_multiheadattention_qk_cross_pack1to4;
        }
        if (K_elempack == 4 && M_elempack == 1)
        {
            pipeline = pipeline_multiheadattention_qk_cross_pack4to1;
        }
        if (K_elempack == 4 && M_elempack == 4)
        {
            pipeline = pipeline_multiheadattention_qk_cross_pack4;
        }

        cmd.record_pipeline(pipeline, bindings, constants, dispatcher);

        // all heads together may allow a wider packing than a single head
        if (MB_elempack > M_elempack)
        {
            VkMat tmp;
            vkdev->convert_packing(qk_cross, tmp, MB_elempack, cmd, opt);
            qk_cross = tmp;
        }
    }

    q_affine.release();
    k_affine.release();

    ret = qk_softmax->forward_inplace(qk_cross, cmd, opt);
    if (ret != 0)
        return ret;

    VkMat v_affine;
    ret = v_gemm->forward(v_blob, v_affine, cmd, opt);
    if (ret != 0)
        return ret;

    VkMat qkv_cross;
    {
        // M and N are the (unpacked) rows per head of the softmax output and
        // of the v gemm output, K is the width of the v gemm output
        int M = qk_cross.h * qk_cross.elempack / num_heads;
        int N = v_affine.h * v_affine.elempack / num_heads;
        int K = v_affine.w;
        int B = num_heads;

        // int M_elempack = opt.use_shader_pack8 && M % 8 == 0 ? 8 : M % 4 == 0 ? 4 : 1;
        // int N_elempack = opt.use_shader_pack8 && N % 8 == 0 ? 8 : N % 4 == 0 ? 4 : 1;
        // int NB_elempack = opt.use_shader_pack8 && (N * B) % 8 == 0 ? 8 : (N * B) % 4 == 0 ? 4 : 1;
        int M_elempack = M % 4 == 0 ? 4 : 1;
        int N_elempack = N % 4 == 0 ? 4 : 1;
        int NB_elempack = (N * B) % 4 == 0 ? 4 : 1;
        size_t N_elemsize = v_affine.elemsize / v_affine.elempack * N_elempack;

        // same element size fixup as for qk_cross
        if (opt.use_fp16_packed && !opt.use_fp16_storage)
        {
            if (N_elempack == 8) N_elemsize = 8 * 2u;
            if (N_elempack == 4) N_elemsize = 4 * 2u;
            if (N_elempack == 1) N_elemsize = 4u;
        }

        // unpack to the per-head packing where the blobs are packed wider
        if (M_elempack < qk_cross.elempack)
        {
            VkMat tmp;
            vkdev->convert_packing(qk_cross, tmp, M_elempack, cmd, opt);
            qk_cross = tmp;
        }

        if (N_elempack < v_affine.elempack)
        {
            VkMat tmp;
            vkdev->convert_packing(v_affine, tmp, N_elempack, cmd, opt);
            v_affine = tmp;
        }

        // w = M, h = all heads stacked (N / N_elempack packed rows per head)
        qkv_cross.create(M, N / N_elempack * B, N_elemsize, N_elempack, opt.blob_vkallocator);
        if (qkv_cross.empty())
            return -100;

        std::vector<VkMat> bindings(3);
        bindings[0] = qk_cross;
        bindings[1] = v_affine;
        bindings[2] = qkv_cross;

        std::vector<vk_constant_type> constants(4);
        constants[0].i = M / M_elempack;
        constants[1].i = N / N_elempack;
        constants[2].i = K;
        constants[3].i = B;

        VkMat dispatcher;
        dispatcher.w = N / N_elempack;
        dispatcher.h = M / M_elempack;
        dispatcher.c = B;

        // M_elempack is the input packing, N_elempack the output packing;
        // both are either 1 or 4, so exactly one branch matches
        const Pipeline* pipeline = 0;
        if (M_elempack == 1 && N_elempack == 1)
        {
            pipeline = pipeline_multiheadattention_qkv_cross;
        }
        if (M_elempack == 1 && N_elempack == 4)
        {
            pipeline = pipeline_multiheadattention_qkv_cross_pack1to4;
        }
        if (M_elempack == 4 && N_elempack == 1)
        {
            pipeline = pipeline_multiheadattention_qkv_cross_pack4to1;
        }
        if (M_elempack == 4 && N_elempack == 4)
        {
            pipeline = pipeline_multiheadattention_qkv_cross_pack4;
        }

        cmd.record_pipeline(pipeline, bindings, constants, dispatcher);

        // all heads together may allow a wider packing than a single head
        if (NB_elempack > N_elempack)
        {
            VkMat tmp;
            vkdev->convert_packing(qkv_cross, tmp, NB_elempack, cmd, opt);
            qkv_cross = tmp;
        }
    }

    qk_cross.release();
    v_affine.release();

    return o_gemm->forward(qkv_cross, top_blobs[0], cmd, opt);
 }

 // Image-storage (VkImageMat) forward pass.
 //
 // bottom_blobs  1 to 3 inputs: [q], [q, k] or [q, k, v]; a missing k reuses
 //               q and a missing v reuses k
 // top_blobs     the result is written to top_blobs[0]
 // cmd           compute command every stage is recorded into
 //
 // Stages:
 //   1. q_gemm and k_gemm are applied to q and k
 //   2. a qk_cross pipeline multiplies them per head (B = num_heads batches)
 //   3. qk_softmax runs in place on that product
 //   4. v_gemm is applied to v, a qkv_cross pipeline combines it with the
 //      softmax output
 //   5. o_gemm produces top_blobs[0]
 //
 // Returns 0 on success, -100 when an intermediate blob cannot be allocated,
 // or the first non-zero code returned by a sub-layer.
 int MultiHeadAttention_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const
 {
    const VkImageMat& q_blob = bottom_blobs[0];
    const VkImageMat& k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1];
    const VkImageMat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs.size() == 2 ? k_blob : bottom_blobs[2];

    // A failed sub-layer may leave its output blob empty. Stop right away in
    // that case: the code below divides by the blob's elempack and binds the
    // blob to a pipeline.
    int ret = 0;

    VkImageMat q_affine;
    ret = q_gemm->forward(q_blob, q_affine, cmd, opt);
    if (ret != 0)
        return ret;

    VkImageMat k_affine;
    ret = k_gemm->forward(k_blob, k_affine, cmd, opt);
    if (ret != 0)
        return ret;

    VkImageMat qk_cross;
    {
        // M and N are the widths of the two gemm outputs, K is the number of
        // (unpacked) rows each head owns, B is the head count
        int M = q_affine.w;
        int N = k_affine.w;
        int K = q_affine.h * q_affine.elempack / num_heads;
        int B = num_heads;

        // int K_elempack = opt.use_shader_pack8 && K % 8 == 0 ? 8 : K % 4 == 0 ? 4 : 1;
        // int M_elempack = opt.use_shader_pack8 && M % 8 == 0 ? 8 : M % 4 == 0 ? 4 : 1;
        // int MB_elempack = opt.use_shader_pack8 && (M * B) % 8 == 0 ? 8 : (M * B) % 4 == 0 ? 4 : 1;
        int K_elempack = K % 4 == 0 ? 4 : 1;
        int M_elempack = M % 4 == 0 ? 4 : 1;
        int MB_elempack = (M * B) % 4 == 0 ? 4 : 1;
        size_t M_elemsize = q_affine.elemsize / q_affine.elempack * M_elempack;

        // fp16 packed without fp16 storage: packed elements are 2 bytes per
        // lane, unpacked ones stay 4 bytes
        if (opt.use_fp16_packed && !opt.use_fp16_storage)
        {
            if (M_elempack == 8) M_elemsize = 8 * 2u;
            if (M_elempack == 4) M_elemsize = 4 * 2u;
            if (M_elempack == 1) M_elemsize = 4u;
        }

        // the inputs may be packed wider than a single head allows; unpack
        // them to the per-head packing first
        if (K_elempack < q_affine.elempack)
        {
            VkImageMat tmp;
            vkdev->convert_packing(q_affine, tmp, K_elempack, cmd, opt);
            q_affine = tmp;
        }
        if (K_elempack < k_affine.elempack)
        {
            VkImageMat tmp;
            vkdev->convert_packing(k_affine, tmp, K_elempack, cmd, opt);
            k_affine = tmp;
        }

        // w = N, h = all heads stacked (M / M_elempack packed rows per head)
        qk_cross.create(N, M / M_elempack * B, M_elemsize, M_elempack, opt.blob_vkallocator);
        if (qk_cross.empty())
            return -100;

        std::vector<VkImageMat> bindings(3);
        bindings[0] = q_affine;
        bindings[1] = k_affine;
        bindings[2] = qk_cross;

        std::vector<vk_constant_type> constants(4);
        constants[0].i = M / M_elempack;
        constants[1].i = N;
        constants[2].i = K / K_elempack;
        constants[3].i = B;

        // one invocation per output element: x = N, y = packed M, z = head
        VkImageMat dispatcher;
        dispatcher.w = N;
        dispatcher.h = M / M_elempack;
        dispatcher.c = B;

        // K_elempack is the input packing, M_elempack the output packing;
        // both are either 1 or 4, so exactly one branch matches
        const Pipeline* pipeline = 0;
        if (K_elempack == 1 && M_elempack == 1)
        {
            pipeline = pipeline_multiheadattention_qk_cross;
        }
        if (K_elempack == 1 && M_elempack == 4)
        {
            pipeline = pipeline_multiheadattention_qk_cross_pack1to4;
        }
        if (K_elempack == 4 && M_elempack == 1)
        {
            pipeline = pipeline_multiheadattention_qk_cross_pack4to1;
        }
        if (K_elempack == 4 && M_elempack == 4)
        {
            pipeline = pipeline_multiheadattention_qk_cross_pack4;
        }

        cmd.record_pipeline(pipeline, bindings, constants, dispatcher);

        // all heads together may allow a wider packing than a single head
        if (MB_elempack > M_elempack)
        {
            VkImageMat tmp;
            vkdev->convert_packing(qk_cross, tmp, MB_elempack, cmd, opt);
            qk_cross = tmp;
        }
    }

    q_affine.release();
    k_affine.release();

    ret = qk_softmax->forward_inplace(qk_cross, cmd, opt);
    if (ret != 0)
        return ret;

    VkImageMat v_affine;
    ret = v_gemm->forward(v_blob, v_affine, cmd, opt);
    if (ret != 0)
        return ret;

    VkImageMat qkv_cross;
    {
        // M and N are the (unpacked) rows per head of the softmax output and
        // of the v gemm output, K is the width of the v gemm output
        int M = qk_cross.h * qk_cross.elempack / num_heads;
        int N = v_affine.h * v_affine.elempack / num_heads;
        int K = v_affine.w;
        int B = num_heads;

        // int M_elempack = opt.use_shader_pack8 && M % 8 == 0 ? 8 : M % 4 == 0 ? 4 : 1;
        // int N_elempack = opt.use_shader_pack8 && N % 8 == 0 ? 8 : N % 4 == 0 ? 4 : 1;
        // int NB_elempack = opt.use_shader_pack8 && (N * B) % 8 == 0 ? 8 : (N * B) % 4 == 0 ? 4 : 1;
        int M_elempack = M % 4 == 0 ? 4 : 1;
        int N_elempack = N % 4 == 0 ? 4 : 1;
        int NB_elempack = (N * B) % 4 == 0 ? 4 : 1;
        size_t N_elemsize = v_affine.elemsize / v_affine.elempack * N_elempack;

        // same element size fixup as for qk_cross
        if (opt.use_fp16_packed && !opt.use_fp16_storage)
        {
            if (N_elempack == 8) N_elemsize = 8 * 2u;
            if (N_elempack == 4) N_elemsize = 4 * 2u;
            if (N_elempack == 1) N_elemsize = 4u;
        }

        // unpack to the per-head packing where the blobs are packed wider
        if (M_elempack < qk_cross.elempack)
        {
            VkImageMat tmp;
            vkdev->convert_packing(qk_cross, tmp, M_elempack, cmd, opt);
            qk_cross = tmp;
        }

        if (N_elempack < v_affine.elempack)
        {
            VkImageMat tmp;
            vkdev->convert_packing(v_affine, tmp, N_elempack, cmd, opt);
            v_affine = tmp;
        }

        // w = M, h = all heads stacked (N / N_elempack packed rows per head)
        qkv_cross.create(M, N / N_elempack * B, N_elemsize, N_elempack, opt.blob_vkallocator);
        if (qkv_cross.empty())
            return -100;

        std::vector<VkImageMat> bindings(3);
        bindings[0] = qk_cross;
        bindings[1] = v_affine;
        bindings[2] = qkv_cross;

        std::vector<vk_constant_type> constants(4);
        constants[0].i = M / M_elempack;
        constants[1].i = N / N_elempack;
        constants[2].i = K;
        constants[3].i = B;

        VkImageMat dispatcher;
        dispatcher.w = N / N_elempack;
        dispatcher.h = M / M_elempack;
        dispatcher.c = B;

        // M_elempack is the input packing, N_elempack the output packing;
        // both are either 1 or 4, so exactly one branch matches
        const Pipeline* pipeline = 0;
        if (M_elempack == 1 && N_elempack == 1)
        {
            pipeline = pipeline_multiheadattention_qkv_cross;
        }
        if (M_elempack == 1 && N_elempack == 4)
        {
            pipeline = pipeline_multiheadattention_qkv_cross_pack1to4;
        }
        if (M_elempack == 4 && N_elempack == 1)
        {
            pipeline = pipeline_multiheadattention_qkv_cross_pack4to1;
        }
        if (M_elempack == 4 && N_elempack == 4)
        {
            pipeline = pipeline_multiheadattention_qkv_cross_pack4;
        }

        cmd.record_pipeline(pipeline, bindings, constants, dispatcher);

        // all heads together may allow a wider packing than a single head
        if (NB_elempack > N_elempack)
        {
            VkImageMat tmp;
            vkdev->convert_packing(qkv_cross, tmp, NB_elempack, cmd, opt);
            qkv_cross = tmp;
        }
    }

    qk_cross.release();
    v_affine.release();

    return o_gemm->forward(qkv_cross, top_blobs[0], cmd, opt);
 }

 } // namespace ncnn
--- a/src/layer/vulkan/multiheadattention_vulkan.h
+++ b/src/layer/vulkan/multiheadattention_vulkan.h
@@ -0,0 +1,57 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_MULTIHEADATTENTION_VULKAN_H
 #define LAYER_MULTIHEADATTENTION_VULKAN_H

 #include "multiheadattention.h"

 namespace ncnn {

 // Vulkan implementation of MultiHeadAttention.
 //
 // The work is split between sub-layers (q/k/v/o gemm and a softmax) and two
 // families of compute pipelines (qk_cross and qkv_cross), each available in
 // four packing variants. All of them are owned by this object and released
 // in destroy_pipeline().
 class MultiHeadAttention_vulkan : virtual public MultiHeadAttention
 {
 public:
    MultiHeadAttention_vulkan();

    // set up / tear down the sub-layers and pipelines declared below
    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // forwards the upload to the gemm sub-layers
    virtual int upload_model(VkTransfer& cmd, const Option& opt);

    // keep the base class forward overloads visible next to the two below
    using MultiHeadAttention::forward;
    // bottom_blobs holds 1 to 3 inputs (q, optionally k, optionally v); the
    // result is written to top_blobs[0]. One overload per storage type.
    virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
    virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;

 public:
    // sub-layers applied to the q, k and v inputs and to the final result
    Layer* q_gemm;
    Layer* k_gemm;
    Layer* v_gemm;
    Layer* o_gemm;

    // applied in place to the qk_cross result
    Layer* qk_softmax;

    // q x k per head; selected by (input packing, output packing):
    // plain = 1/1, pack4 = 4/4, pack1to4 = 1/4, pack4to1 = 4/1
    Pipeline* pipeline_multiheadattention_qk_cross;
    Pipeline* pipeline_multiheadattention_qk_cross_pack4;
    Pipeline* pipeline_multiheadattention_qk_cross_pack1to4;
    Pipeline* pipeline_multiheadattention_qk_cross_pack4to1;

    // softmax(qk) x v per head; same naming scheme as above
    Pipeline* pipeline_multiheadattention_qkv_cross;
    Pipeline* pipeline_multiheadattention_qkv_cross_pack4;
    Pipeline* pipeline_multiheadattention_qkv_cross_pack1to4;
    Pipeline* pipeline_multiheadattention_qkv_cross_pack4to1;
 };

 } // namespace ncnn

 #endif // LAYER_MULTIHEADATTENTION_VULKAN_H
--- a/src/layer/vulkan/shader/gemm.comp
+++ b/src/layer/vulkan/shader/gemm.comp
@@ -0,0 +1,453 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // number of K steps staged in shared memory per iteration of the
 // local-memory loop in main()
 #define LOCAL_MEMORY_UNROLL_INCH 8

 // Specialization constants.
 // alpha scales the final sums, beta scales the values loaded from C.
 layout (constant_id = 0) const float alpha = 1.f;
 layout (constant_id = 1) const float beta = 1.f;
 // 1 = read A / B transposed
 layout (constant_id = 2) const int transA = 0;
 layout (constant_id = 3) const int transB = 0;
 // constantA and constantB are not read by main(); constantC == 1 makes
 // main() take the C broadcast type from constant_broadcast_type_C instead
 // of the push constant
 layout (constant_id = 4) const int constantA = 0;
 layout (constant_id = 5) const int constantB = 0;
 layout (constant_id = 6) const int constantC = 0;
 // problem size, accessed through psc() in main()
 // NOTE(review): psc() is defined outside this file; presumably it falls
 // back to the push constant of the same name when the value here is 0 - confirm
 layout (constant_id = 7) const int M = 0;
 layout (constant_id = 8) const int N = 0;
 layout (constant_id = 9) const int K = 0;
 layout (constant_id = 10) const int constant_broadcast_type_C = 0;
 // 1 = image output places the row index on the z axis (y fixed at 0)
 layout (constant_id = 11) const int output_N1M = 0;
 // output_elempack and output_elemtype are not read by main()
 layout (constant_id = 12) const int output_elempack = 0;
 layout (constant_id = 13) const int output_elemtype = 0;
 // 1 = write the result transposed
 layout (constant_id = 14) const int output_transpose = 0;

 // TODO psc more

 // binding 0 is the output, bindings 1..3 are A, B and C
 #if NCNN_image_shader
 layout (binding = 0, imfmtc1) writeonly uniform unfp image3D top_blob_3d;
 layout (binding = 1) uniform unfp sampler3D A_blob_3d;
 layout (binding = 2) uniform unfp sampler3D B_blob_3d;
 layout (binding = 3) uniform unfp sampler3D C_blob_3d;
 #else
 layout (binding = 0) writeonly buffer top_blob { sfp top_blob_data[]; };
 layout (binding = 1) readonly buffer A_blob { sfp A_blob_data[]; };
 layout (binding = 2) readonly buffer B_blob { sfp B_blob_data[]; };
 layout (binding = 3) readonly buffer C_blob { sfp C_blob_data[]; };
 #endif

 // Push constants. A_dims / B_dims select the image addressing in main(),
 // A_hstep / B_hstep / outhstep are the row strides used by the buffer
 // addressing; outdims is not read by main().
 layout (push_constant) uniform parameter
 {
    int M;
    int N;
    int K;
    int broadcast_type_C;
    int A_dims;
    int A_hstep;
    int B_dims;
    int B_hstep;
    int outdims;
    int outhstep;
 } p;

 // staging tiles for the local-memory loop: [local index][k step][two
 // adjacent rows of A resp. columns of B]
 #if NCNN_shader_local_memory
 shared lfp tmp_a[8][LOCAL_MEMORY_UNROLL_INCH][2];
 shared lfp tmp_b[8][LOCAL_MEMORY_UNROLL_INCH][2];
 #endif

 // Computes top = alpha * (beta * C + A x B).
 //
 // Every invocation produces a 2x2 tile of the output:
 //   sum0 -> (row gy,     column gx)
 //   sum1 -> (row gy,     column gx + 1)
 //   sum2 -> (row gy + 1, column gx)
 //   sum3 -> (row gy + 1, column gx + 1)
 // transA / transB select transposed reads of A / B, output_transpose selects
 // a transposed write. The stores of the second row / column are bounds
 // checked; the corresponding loads of A, B and C are not.
 void main()
 {
    // top-left corner of this invocation's 2x2 tile
    int gx = int(gl_GlobalInvocationID.x) * 2;
    int gy = int(gl_GlobalInvocationID.y) * 2;
    int gz = int(gl_GlobalInvocationID.z);

    // with local memory the bounds check is postponed until after the loops
    // that call barrier() (see below)
 #if !NCNN_shader_local_memory
    if (gx >= psc(N) || gy >= psc(M) || gz >= 1)
        return;
 #endif

    afp sum0 = afp(0.f);
    afp sum1 = afp(0.f);
    afp sum2 = afp(0.f);
    afp sum3 = afp(0.f);

    const int broadcast_type_C = constantC == 1 ? constant_broadcast_type_C : p.broadcast_type_C;

    // Seed the sums from C according to its broadcast type:
    //   0 = single value, 1 / 2 = one value per output row,
    //   3 = full matrix, 4 = one value per output column.
    // Any other value matches no branch and leaves the sums at zero.
 #if NCNN_image_shader
    if (broadcast_type_C == 0)
    {
        sum0 = image3d_ld1(C_blob_3d, ivec3(0, 0, 0));
        sum1 = sum0;
        sum2 = sum0;
        sum3 = sum0;
    }
    if (broadcast_type_C == 1)
    {
        // per-row value addressed along x
        sum0 = image3d_ld1(C_blob_3d, ivec3(gy, 0, 0));
        sum1 = sum0;
        sum2 = image3d_ld1(C_blob_3d, ivec3(gy + 1, 0, 0));
        sum3 = sum2;
    }
    if (broadcast_type_C == 2)
    {
        // per-row value addressed along y
        sum0 = image3d_ld1(C_blob_3d, ivec3(0, gy, 0));
        sum1 = sum0;
        sum2 = image3d_ld1(C_blob_3d, ivec3(0, gy + 1, 0));
        sum3 = sum2;
    }
    if (broadcast_type_C == 3)
    {
        sum0 = image3d_ld1(C_blob_3d, ivec3(gx, gy, 0));
        sum1 = image3d_ld1(C_blob_3d, ivec3(gx + 1, gy, 0));
        sum2 = image3d_ld1(C_blob_3d, ivec3(gx, gy + 1, 0));
        sum3 = image3d_ld1(C_blob_3d, ivec3(gx + 1, gy + 1, 0));
    }
    if (broadcast_type_C == 4)
    {
        sum0 = image3d_ld1(C_blob_3d, ivec3(gx, 0, 0));
        sum1 = image3d_ld1(C_blob_3d, ivec3(gx + 1, 0, 0));
        sum2 = sum0;
        sum3 = sum1;
    }
 #else
    if (broadcast_type_C == 0)
    {
        sum0 = buffer_ld1(C_blob_data, 0);
        sum1 = sum0;
        sum2 = sum0;
        sum3 = sum0;
    }
    if (broadcast_type_C == 1 || broadcast_type_C == 2)
    {
        // both per-row types use the same linear index in a buffer
        sum0 = buffer_ld1(C_blob_data, gy);
        sum1 = sum0;
        sum2 = buffer_ld1(C_blob_data, gy + 1);
        sum3 = sum2;
    }
    if (broadcast_type_C == 3)
    {
        // full matrix, addressed with N as the row stride
        const int ci = gy * psc(N) + gx;
        sum0 = buffer_ld1(C_blob_data, ci);
        sum1 = buffer_ld1(C_blob_data, ci + 1);
        sum2 = buffer_ld1(C_blob_data, ci + psc(N));
        sum3 = buffer_ld1(C_blob_data, ci + psc(N) + 1);
    }
    if (broadcast_type_C == 4)
    {
        sum0 = buffer_ld1(C_blob_data, gx);
        sum1 = buffer_ld1(C_blob_data, gx + 1);
        sum2 = sum0;
        sum3 = sum1;
    }
 #endif

    sum0 *= afp(beta);
    sum1 *= afp(beta);
    sum2 *= afp(beta);
    sum3 *= afp(beta);

    // Local-memory path (buffer storage only): the workgroup stages
    // LOCAL_MEMORY_UNROLL_INCH K steps of A and B in shared memory, then every
    // invocation accumulates from the staged tiles.
    // NOTE(review): lx and ly index arrays of extent 8, so this assumes a
    // local size of at most 8x8 - confirm against the pipeline setup.
 #if !NCNN_image_shader && NCNN_shader_local_memory
    const int NN = psc(K);

    const int lx = int(gl_LocalInvocationID.x);
    const int ly = int(gl_LocalInvocationID.y);

    int k = 0;
    for (; k + (LOCAL_MEMORY_UNROLL_INCH - 1) < NN; k += LOCAL_MEMORY_UNROLL_INCH)
    {
        {
            // invocation (lx, ly) stages K step k + lx of A for its two rows ...
            if (transA == 1)
            {
                const int ai = (k + lx) * p.A_hstep + gy;
                tmp_a[ly][lx][0] = sfp2lfp(A_blob_data[ai]);
                tmp_a[ly][lx][1] = sfp2lfp(A_blob_data[ai + 1]);
            }
            else
            {
                const int ai = gy * p.A_hstep + (k + lx);
                tmp_a[ly][lx][0] = sfp2lfp(A_blob_data[ai]);
                tmp_a[ly][lx][1] = sfp2lfp(A_blob_data[ai + p.A_hstep]);
            }

            // ... and K step k + ly of B for its two columns
            if (transB == 1)
            {
                const int bi = gx * p.B_hstep + (k + ly);
                tmp_b[lx][ly][0] = sfp2lfp(B_blob_data[bi]);
                tmp_b[lx][ly][1] = sfp2lfp(B_blob_data[bi + p.B_hstep]);
            }
            else
            {
                const int bi = (k + ly) * p.B_hstep + gx;
                tmp_b[lx][ly][0] = sfp2lfp(B_blob_data[bi]);
                tmp_b[lx][ly][1] = sfp2lfp(B_blob_data[bi + 1]);
            }
        }

        // staged tiles must be complete before anyone reads them
        barrier();

        for (int k4 = 0; k4 < LOCAL_MEMORY_UNROLL_INCH; k4++)
        {
            afp a0 = lfp2afp(tmp_a[ly][k4][0]);
            afp a1 = lfp2afp(tmp_a[ly][k4][1]);

            afp b0 = lfp2afp(tmp_b[lx][k4][0]);
            afp b1 = lfp2afp(tmp_b[lx][k4][1]);

            sum0 += a0 * b0;
            sum1 += a0 * b1;
            sum2 += a1 * b0;
            sum3 += a1 * b1;
        }

        // do not overwrite the tiles while others still read them
        barrier();
    }

    // tail: fewer than LOCAL_MEMORY_UNROLL_INCH K steps are left
    if (k < NN)
    {
        const int remain = NN - k;

        // only the invocations that map to an existing K step stage data
        if (lx < remain)
        {
            if (transA == 1)
            {
                const int ai = (k + lx) * p.A_hstep + gy;
                tmp_a[ly][lx][0] = sfp2lfp(A_blob_data[ai]);
                tmp_a[ly][lx][1] = sfp2lfp(A_blob_data[ai + 1]);
            }
            else
            {
                const int ai = gy * p.A_hstep + (k + lx);
                tmp_a[ly][lx][0] = sfp2lfp(A_blob_data[ai]);
                tmp_a[ly][lx][1] = sfp2lfp(A_blob_data[ai + p.A_hstep]);
            }
        }

        if (ly < remain)
        {
            if (transB == 1)
            {
                const int bi = gx * p.B_hstep + (k + ly);
                tmp_b[lx][ly][0] = sfp2lfp(B_blob_data[bi]);
                tmp_b[lx][ly][1] = sfp2lfp(B_blob_data[bi + p.B_hstep]);
            }
            else
            {
                const int bi = (k + ly) * p.B_hstep + gx;
                tmp_b[lx][ly][0] = sfp2lfp(B_blob_data[bi]);
                tmp_b[lx][ly][1] = sfp2lfp(B_blob_data[bi + 1]);
            }
        }

        barrier();

        for (int k4 = 0; k4 < remain; k4++)
        {
            afp a0 = lfp2afp(tmp_a[ly][k4][0]);
            afp a1 = lfp2afp(tmp_a[ly][k4][1]);

            afp b0 = lfp2afp(tmp_b[lx][k4][0]);
            afp b1 = lfp2afp(tmp_b[lx][k4][1]);

            sum0 += a0 * b0;
            sum1 += a0 * b1;
            sum2 += a1 * b0;
            sum3 += a1 * b1;
        }
    }
 #else
    // direct path: every invocation reads its own operands for each K step
    for (int k = 0; k < psc(K); k++)
    {
        afp a0;
        afp a1;
        afp b0;
        afp b1;
 #if NCNN_image_shader
        // 3-dim inputs keep y at 0 and use z for the second matrix axis
        if (transA == 1)
        {
            if (p.A_dims == 3)
            {
                a0 = image3d_ld1(A_blob_3d, ivec3(gy, 0, k));
                a1 = image3d_ld1(A_blob_3d, ivec3(gy + 1, 0, k));
            }
            else
            {
                a0 = image3d_ld1(A_blob_3d, ivec3(gy, k, 0));
                a1 = image3d_ld1(A_blob_3d, ivec3(gy + 1, k, 0));
            }
        }
        else
        {
            if (p.A_dims == 3)
            {
                a0 = image3d_ld1(A_blob_3d, ivec3(k, 0, gy));
                a1 = image3d_ld1(A_blob_3d, ivec3(k, 0, gy + 1));
            }
            else
            {
                a0 = image3d_ld1(A_blob_3d, ivec3(k, gy, 0));
                a1 = image3d_ld1(A_blob_3d, ivec3(k, gy + 1, 0));
            }
        }

        if (transB == 1)
        {
            if (p.B_dims == 3)
            {
                b0 = image3d_ld1(B_blob_3d, ivec3(k, 0, gx));
                b1 = image3d_ld1(B_blob_3d, ivec3(k, 0, gx + 1));
            }
            else
            {
                b0 = image3d_ld1(B_blob_3d, ivec3(k, gx, 0));
                b1 = image3d_ld1(B_blob_3d, ivec3(k, gx + 1, 0));
            }
        }
        else
        {
            if (p.B_dims == 3)
            {
                b0 = image3d_ld1(B_blob_3d, ivec3(gx, 0, k));
                b1 = image3d_ld1(B_blob_3d, ivec3(gx + 1, 0, k));
            }
            else
            {
                b0 = image3d_ld1(B_blob_3d, ivec3(gx, k, 0));
                b1 = image3d_ld1(B_blob_3d, ivec3(gx + 1, k, 0));
            }
        }
 #else
        // buffer addressing: row * hstep + column, with the roles of row and
        // column swapped for transposed operands
        if (transA == 1)
        {
            const int ai = k * p.A_hstep + gy;
            a0 = buffer_ld1(A_blob_data, ai);
            a1 = buffer_ld1(A_blob_data, ai + 1);
        }
        else
        {
            const int ai = gy * p.A_hstep + k;
            a0 = buffer_ld1(A_blob_data, ai);
            a1 = buffer_ld1(A_blob_data, ai + p.A_hstep);
        }

        if (transB == 1)
        {
            const int bi = gx * p.B_hstep + k;
            b0 = buffer_ld1(B_blob_data, bi);
            b1 = buffer_ld1(B_blob_data, bi + p.B_hstep);
        }
        else
        {
            const int bi = k * p.B_hstep + gx;
            b0 = buffer_ld1(B_blob_data, bi);
            b1 = buffer_ld1(B_blob_data, bi + 1);
        }
 #endif

        sum0 += a0 * b0;
        sum1 += a0 * b1;
        sum2 += a1 * b0;
        sum3 += a1 * b1;
    }
 #endif

    // postponed bounds check of the local-memory build; presumably placed here
    // so that every invocation of the workgroup reaches the barrier() calls above
 #if NCNN_shader_local_memory
    if (gx >= psc(N) || gy >= psc(M) || gz >= 1)
        return;
 #endif

    sum0 *= afp(alpha);
    sum1 *= afp(alpha);
    sum2 *= afp(alpha);
    sum3 *= afp(alpha);

    // Store the tile. The second row / column is only written when it lies
    // inside the M x N output.
 #if NCNN_image_shader
    if (output_transpose == 1)
    {
        // transposed: the row index gy goes to x
        if (output_N1M == 1)
        {
            image3d_st1(top_blob_3d, ivec3(gy, 0, gx), sum0);
            if (gy + 1 < psc(M)) image3d_st1(top_blob_3d, ivec3(gy + 1, 0, gx), sum2);
            if (gx + 1 < psc(N))
            {
                image3d_st1(top_blob_3d, ivec3(gy, 0, gx + 1), sum1);
                if (gy + 1 < psc(M)) image3d_st1(top_blob_3d, ivec3(gy + 1, 0, gx + 1), sum3);
            }
        }
        else
        {
            image3d_st1(top_blob_3d, ivec3(gy, gx, 0), sum0);
            if (gy + 1 < psc(M)) image3d_st1(top_blob_3d, ivec3(gy + 1, gx, 0), sum2);
            if (gx + 1 < psc(N))
            {
                image3d_st1(top_blob_3d, ivec3(gy, gx + 1, 0), sum1);
                if (gy + 1 < psc(M)) image3d_st1(top_blob_3d, ivec3(gy + 1, gx + 1, 0), sum3);
            }
        }
    }
    else
    {
        if (output_N1M == 1)
        {
            // row index on the z axis, y fixed at 0
            image3d_st1(top_blob_3d, ivec3(gx, 0, gy), sum0);
            if (gx + 1 < psc(N)) image3d_st1(top_blob_3d, ivec3(gx + 1, 0, gy), sum1);
            if (gy + 1 < psc(M))
            {
                image3d_st1(top_blob_3d, ivec3(gx, 0, gy + 1), sum2);
                if (gx + 1 < psc(N)) image3d_st1(top_blob_3d, ivec3(gx + 1, 0, gy + 1), sum3);
            }
        }
        else
        {
            image3d_st1(top_blob_3d, ivec3(gx, gy, 0), sum0);
            if (gx + 1 < psc(N)) image3d_st1(top_blob_3d, ivec3(gx + 1, gy, 0), sum1);
            if (gy + 1 < psc(M))
            {
                image3d_st1(top_blob_3d, ivec3(gx, gy + 1, 0), sum2);
                if (gx + 1 < psc(N)) image3d_st1(top_blob_3d, ivec3(gx + 1, gy + 1, 0), sum3);
            }
        }
    }
 #else
    // the buffer stores do not consult output_N1M; only outhstep is used
    if (output_transpose == 1)
    {
        const int gi = gx * p.outhstep + gy;

        buffer_st1(top_blob_data, gi, sum0);
        if (gy + 1 < psc(M)) buffer_st1(top_blob_data, gi + 1, sum2);
        if (gx + 1 < psc(N))
        {
            buffer_st1(top_blob_data, gi + p.outhstep, sum1);
            if (gy + 1 < psc(M)) buffer_st1(top_blob_data, gi + p.outhstep + 1, sum3);
        }
    }
    else
    {
        const int gi = gy * p.outhstep + gx;

        buffer_st1(top_blob_data, gi, sum0);
        if (gx + 1 < psc(N)) buffer_st1(top_blob_data, gi + 1, sum1);
        if (gy + 1 < psc(M))
        {
            buffer_st1(top_blob_data, gi + p.outhstep, sum2);
            if (gx + 1 < psc(N)) buffer_st1(top_blob_data, gi + p.outhstep + 1, sum3);
        }
    }
 #endif
 }
--- a/src/layer/vulkan/shader/multiheadattention_qk_cross.comp
+++ b/src/layer/vulkan/shader/multiheadattention_qk_cross.comp
@@ -0,0 +1,81 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Problem size as specialization constants, accessed through psc() in main():
 // M = rows of the result per head, N = columns of the result,
 // K = reduction length per head, B = number of heads.
 // NOTE(review): psc() is defined outside this file; presumably it falls back
 // to the push constant of the same name when the value here is 0 - confirm
 layout (constant_id = 0) const int M = 0;
 layout (constant_id = 1) const int N = 0;
 layout (constant_id = 2) const int K = 0;
 layout (constant_id = 3) const int B = 0;

 // bindings 0 and 1 are the q and k inputs, binding 2 is the result
 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D q_blob_3d;
 layout (binding = 1) uniform unfp sampler3D k_blob_3d;
 layout (binding = 2, imfmtc1) writeonly uniform unfp image3D qkcross_blob_3d;
 #else
 layout (binding = 0) readonly buffer q_blob { sfp q_blob_data[]; };
 layout (binding = 1) readonly buffer k_blob { sfp k_blob_data[]; };
 layout (binding = 2) writeonly buffer qkcross_blob { sfp qkcross_blob_data[]; };
 #endif

 // same four sizes as push constants
 layout (push_constant) uniform parameter
 {
    int M;
    int N;
    int K;
    int B;
 } p;

 // One invocation per output element: x selects the column (index into k),
 // y the row (index into q) and z the head. The result is the sum over the
 // head's K rows of q[row] * k[col]. Both inputs hold the heads stacked along
 // their row axis, the output likewise.
 void main()
 {
    const int col = int(gl_GlobalInvocationID.x);
    const int row = int(gl_GlobalInvocationID.y);
    const int head = int(gl_GlobalInvocationID.z);

    if (col >= psc(N) || row >= psc(M) || head >= psc(B))
        return;

    afp acc = afp(0.f);

    for (int i = 0; i < psc(K); i++)
    {
        // row of both inputs that holds step i of this head
        const int src_row = head * psc(K) + i;

 #if NCNN_image_shader
        afp qv = image3d_ld1(q_blob_3d, ivec3(row, src_row, 0));

        afp kv = image3d_ld1(k_blob_3d, ivec3(col, src_row, 0));
 #else
        const int qi = src_row * psc(M) + row;
        afp qv = buffer_ld1(q_blob_data, qi);

        const int ki = src_row * psc(N) + col;
        afp kv = buffer_ld1(k_blob_data, ki);
 #endif

        acc += qv * kv;
    }

    // row of the output that belongs to this head and query row
    const int dst_row = head * psc(M) + row;

 #if NCNN_image_shader
    image3d_st1(qkcross_blob_3d, ivec3(col, dst_row, 0), acc);
 #else
    const int oi = dst_row * psc(N) + col;
    buffer_st1(qkcross_blob_data, oi, acc);
 #endif
 }
--- a/src/layer/vulkan/shader/multiheadattention_qk_cross_pack1to4.comp
+++ b/src/layer/vulkan/shader/multiheadattention_qk_cross_pack1to4.comp
@@ -0,0 +1,90 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Problem sizes as specialization constants, all defaulting to 0.
 // In main(): N bounds the x invocation index, M bounds y (each y index covers
 // four q columns), B bounds z and K is the trip count of the reduction loop.
 // NOTE(review): psc() is defined outside this file; presumably it chooses
 // between these constants and the push-constant copies in `p` - confirm.
 layout (constant_id = 0) const int M = 0;
 layout (constant_id = 1) const int N = 0;
 layout (constant_id = 2) const int K = 0;
 layout (constant_id = 3) const int B = 0;

 #if NCNN_image_shader
 // Image path: scalar loads from q and k, four-component store to the output.
 layout (binding = 0) uniform unfp sampler3D q_blob_3d;
 layout (binding = 1) uniform unfp sampler3D k_blob_3d;
 layout (binding = 2, imfmtc4) writeonly uniform unfp image3D qkcross_blob_3d;
 #else
 // Buffer path: q and k are arrays of sfp, the output is an array of sfpvec4
 // (type macros defined elsewhere).
 layout (binding = 0) readonly buffer q_blob { sfp q_blob_data[]; };
 layout (binding = 1) readonly buffer k_blob { sfp k_blob_data[]; };
 layout (binding = 2) writeonly buffer qkcross_blob { sfpvec4 qkcross_blob_data[]; };
 #endif

 // Push-constant copies of the four sizes, using the same names as the
 // specialization constants above.
 layout (push_constant) uniform parameter
 {
    int M;
    int N;
    int K;
    int B;
 } p;

 // One invocation produces one vec4 of the q*k product. The four lanes hold
 // the results for four consecutive q columns (row4 * 4 .. row4 * 4 + 3), all
 // multiplied by the same scalar k value at each reduction step.
 void main()
 {
    const int col = int(gl_GlobalInvocationID.x);
    const int row4 = int(gl_GlobalInvocationID.y);
    const int batch = int(gl_GlobalInvocationID.z);

    // Invocations outside the N x M x B output have nothing to compute.
    if (col >= psc(N) || row4 >= psc(M) || batch >= psc(B))
        return;

    afpvec4 acc = afpvec4(0.f);

 #if NCNN_image_shader
    // x of the first of the four q texels and y of reduction step 0.
    const int q_x = row4 * 4;
    const int step0_y = batch * psc(K);
 #else
    // Flat offsets of reduction step 0 in the q and k buffers.
    const int q_base = (batch * psc(M) * psc(K) + row4) * 4;
    const int k_base = batch * psc(N) * psc(K) + col;
 #endif

    for (int kk = 0; kk < psc(K); kk++)
    {
 #if NCNN_image_shader
        const int y = step0_y + kk;
        afp qa = image3d_ld1(q_blob_3d, ivec3(q_x, y, 0));
        afp qb = image3d_ld1(q_blob_3d, ivec3(q_x + 1, y, 0));
        afp qc = image3d_ld1(q_blob_3d, ivec3(q_x + 2, y, 0));
        afp qd = image3d_ld1(q_blob_3d, ivec3(q_x + 3, y, 0));

        afp kv = image3d_ld1(k_blob_3d, ivec3(col, y, 0));
 #else
        // One reduction step is M * 4 scalars further on in q, N further in k.
        const int q_idx = q_base + kk * psc(M) * 4;
        afp qa = buffer_ld1(q_blob_data, q_idx);
        afp qb = buffer_ld1(q_blob_data, q_idx + 1);
        afp qc = buffer_ld1(q_blob_data, q_idx + 2);
        afp qd = buffer_ld1(q_blob_data, q_idx + 3);

        const int k_idx = k_base + kk * psc(N);
        afp kv = buffer_ld1(k_blob_data, k_idx);
 #endif

        // Lane-wise multiply-accumulate of the four q values with the k value.
        acc += afpvec4(qa, qb, qc, qd) * kv;
    }

 #if NCNN_image_shader
    image3d_st4(qkcross_blob_3d, ivec3(col, batch * psc(M) + row4, 0), acc);
 #else
    const int out_idx = (batch * psc(M) + row4) * psc(N) + col;
    buffer_st4(qkcross_blob_data, out_idx, acc);
 #endif
 }
--- a/src/layer/vulkan/shader/multiheadattention_qk_cross_pack4.comp
+++ b/src/layer/vulkan/shader/multiheadattention_qk_cross_pack4.comp
@@ -0,0 +1,186 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Number of reduction steps staged in shared memory per iteration of the
 // tiled loop in main().
 #define LOCAL_MEMORY_UNROLL_INCH 8

 // Problem sizes as specialization constants, all defaulting to 0.
 // In main(): N bounds the x invocation index, M bounds y, B bounds z and
 // K is the trip count of the reduction loop.
 // NOTE(review): psc() is defined outside this file; presumably it chooses
 // between these constants and the push-constant copies in `p` - confirm.
 layout (constant_id = 0) const int M = 0;
 layout (constant_id = 1) const int N = 0;
 layout (constant_id = 2) const int K = 0;
 layout (constant_id = 3) const int B = 0;

 #if NCNN_image_shader
 // Image path: four-component loads from q and k, four-component store.
 layout (binding = 0) uniform unfp sampler3D q_blob_3d;
 layout (binding = 1) uniform unfp sampler3D k_blob_3d;
 layout (binding = 2, imfmtc4) writeonly uniform unfp image3D qkcross_blob_3d;
 #else
 // Buffer path: all three blobs are arrays of sfpvec4 (type macro defined
 // elsewhere).
 layout (binding = 0) readonly buffer q_blob { sfpvec4 q_blob_data[]; };
 layout (binding = 1) readonly buffer k_blob { sfpvec4 k_blob_data[]; };
 layout (binding = 2) writeonly buffer qkcross_blob { sfpvec4 qkcross_blob_data[]; };
 #endif

 // Push-constant copies of the four sizes, using the same names as the
 // specialization constants above.
 layout (push_constant) uniform parameter
 {
    int M;
    int N;
    int K;
    int B;
 } p;

 #if NCNN_shader_local_memory
 // Shared staging tiles used by the tiled loop in main().
 // tmp_q is indexed [local y][reduction step][one of four q vectors],
 // tmp_k is indexed [local x][reduction step].
 // NOTE(review): the first dimension of 8 assumes local x/y ids stay below 8
 // - confirm against the workgroup size chosen by the host code.
 shared lfpvec4 tmp_q[8][LOCAL_MEMORY_UNROLL_INCH][4];
 shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH];
 #endif

 // One invocation produces one vec4 of the q*k product. Each of the four
 // lanes is the sum, over the reduction loop, of dot(q_i, k) where q_0..q_3
 // are four consecutive vec4 entries of q and k is one vec4 entry of k.
 //
 // Two implementations are compiled depending on the build macros:
 //  - buffer storage with local memory: the workgroup cooperatively stages
 //    LOCAL_MEMORY_UNROLL_INCH reduction steps of q and k in shared memory;
 //  - everything else: each invocation loads its own operands directly.
 void main()
 {
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // Only the shared-memory path contains barrier() and therefore needs
    // every invocation of the workgroup to stay alive. That path is compiled
    // solely for buffer storage with local memory enabled, so in every other
    // configuration (including image storage with local memory enabled)
    // out-of-range invocations leave right away instead of running the whole
    // reduction on coordinates outside the blobs.
 #if NCNN_image_shader || !NCNN_shader_local_memory
    if (gx >= psc(N) || gy >= psc(M) || gz >= psc(B))
        return;
 #endif

    afpvec4 sum = afpvec4(0.f);

 #if !NCNN_image_shader && NCNN_shader_local_memory
    const int NN = psc(K);

    const int lx = int(gl_LocalInvocationID.x);
    const int ly = int(gl_LocalInvocationID.y);

    // For staging, the local x id selects the reduction step of the q entry
    // this invocation loads, and the local y id selects the step for k.
    int ai = (gz * psc(M) * psc(K) + lx * psc(M) + gy) * 4;
    int bi = gz * psc(N) * psc(K) + ly * psc(N) + gx;

    int k = 0;
    for (; k + (LOCAL_MEMORY_UNROLL_INCH - 1) < NN; k += LOCAL_MEMORY_UNROLL_INCH)
    {
        {
            tmp_q[ly][lx][0] = sfp2lfpvec4(q_blob_data[ai]);
            tmp_q[ly][lx][1] = sfp2lfpvec4(q_blob_data[ai + 1]);
            tmp_q[ly][lx][2] = sfp2lfpvec4(q_blob_data[ai + 2]);
            tmp_q[ly][lx][3] = sfp2lfpvec4(q_blob_data[ai + 3]);
        }

        {
            tmp_k[lx][ly] = sfp2lfpvec4(k_blob_data[bi]);
        }

        // Make the freshly staged tiles visible to the whole workgroup.
        barrier();

        for (int k4 = 0; k4 < LOCAL_MEMORY_UNROLL_INCH; k4++)
        {
            afpvec4 q0 = lfp2afpvec4(tmp_q[ly][k4][0]);
            afpvec4 q1 = lfp2afpvec4(tmp_q[ly][k4][1]);
            afpvec4 q2 = lfp2afpvec4(tmp_q[ly][k4][2]);
            afpvec4 q3 = lfp2afpvec4(tmp_q[ly][k4][3]);

            afpvec4 k0 = lfp2afpvec4(tmp_k[lx][k4]);

            sum.r += dot(q0, k0);
            sum.g += dot(q1, k0);
            sum.b += dot(q2, k0);
            sum.a += dot(q3, k0);
        }

        ai += LOCAL_MEMORY_UNROLL_INCH * psc(M) * 4;
        bi += LOCAL_MEMORY_UNROLL_INCH * psc(N);

        // Nobody may overwrite the tiles while others are still reading them.
        barrier();
    }

    if (k < NN)
    {
        // Fewer than LOCAL_MEMORY_UNROLL_INCH steps are left: only the
        // invocations whose staging slot maps to a remaining step load.
        const int remain = NN - k;

        if (lx < remain)
        {
            tmp_q[ly][lx][0] = sfp2lfpvec4(q_blob_data[ai]);
            tmp_q[ly][lx][1] = sfp2lfpvec4(q_blob_data[ai + 1]);
            tmp_q[ly][lx][2] = sfp2lfpvec4(q_blob_data[ai + 2]);
            tmp_q[ly][lx][3] = sfp2lfpvec4(q_blob_data[ai + 3]);
        }

        if (ly < remain)
        {
            tmp_k[lx][ly] = sfp2lfpvec4(k_blob_data[bi]);
        }

        barrier();

        for (int k4 = 0; k4 < remain; k4++)
        {
            afpvec4 q0 = lfp2afpvec4(tmp_q[ly][k4][0]);
            afpvec4 q1 = lfp2afpvec4(tmp_q[ly][k4][1]);
            afpvec4 q2 = lfp2afpvec4(tmp_q[ly][k4][2]);
            afpvec4 q3 = lfp2afpvec4(tmp_q[ly][k4][3]);

            afpvec4 k0 = lfp2afpvec4(tmp_k[lx][k4]);

            sum.r += dot(q0, k0);
            sum.g += dot(q1, k0);
            sum.b += dot(q2, k0);
            sum.a += dot(q3, k0);
        }
    }

    // All barriers are behind us; out-of-range invocations only took part to
    // fill the shared tiles and must not store a result.
    if (gx >= psc(N) || gy >= psc(M) || gz >= psc(B))
        return;
 #else
    for (int k = 0; k < psc(K); k++)
    {
 #if NCNN_image_shader
        afpvec4 q0 = image3d_ld4(q_blob_3d, ivec3(gy * 4, gz * psc(K) + k, 0));
        afpvec4 q1 = image3d_ld4(q_blob_3d, ivec3(gy * 4 + 1, gz * psc(K) + k, 0));
        afpvec4 q2 = image3d_ld4(q_blob_3d, ivec3(gy * 4 + 2, gz * psc(K) + k, 0));
        afpvec4 q3 = image3d_ld4(q_blob_3d, ivec3(gy * 4 + 3, gz * psc(K) + k, 0));

        afpvec4 k0 = image3d_ld4(k_blob_3d, ivec3(gx, gz * psc(K) + k, 0));
 #else
        const int ai = (gz * psc(M) * psc(K) + k * psc(M) + gy) * 4;
        afpvec4 q0 = buffer_ld4(q_blob_data, ai);
        afpvec4 q1 = buffer_ld4(q_blob_data, ai + 1);
        afpvec4 q2 = buffer_ld4(q_blob_data, ai + 2);
        afpvec4 q3 = buffer_ld4(q_blob_data, ai + 3);

        const int bi = gz * psc(N) * psc(K) + k * psc(N) + gx;
        afpvec4 k0 = buffer_ld4(k_blob_data, bi);
 #endif

        sum.r += dot(q0, k0);
        sum.g += dot(q1, k0);
        sum.b += dot(q2, k0);
        sum.a += dot(q3, k0);
    }
 #endif

 #if NCNN_image_shader
    image3d_st4(qkcross_blob_3d, ivec3(gx, gz * psc(M) + gy, 0), sum);
 #else
    const int gi = gz * psc(M) * psc(N) + gy * psc(N) + gx;
    buffer_st4(qkcross_blob_data, gi, sum);
 #endif
 }
--- a/src/layer/vulkan/shader/multiheadattention_qk_cross_pack4to1.comp
+++ b/src/layer/vulkan/shader/multiheadattention_qk_cross_pack4to1.comp
@@ -0,0 +1,81 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Problem sizes as specialization constants, all defaulting to 0.
 // In main(): N bounds the x invocation index, M bounds y, B bounds z and
 // K is the trip count of the reduction loop.
 // NOTE(review): psc() is defined outside this file; presumably it chooses
 // between these constants and the push-constant copies in `p` - confirm.
 layout (constant_id = 0) const int M = 0;
 layout (constant_id = 1) const int N = 0;
 layout (constant_id = 2) const int K = 0;
 layout (constant_id = 3) const int B = 0;

 #if NCNN_image_shader
 // Image path: four-component loads from q and k, scalar store to the output.
 layout (binding = 0) uniform unfp sampler3D q_blob_3d;
 layout (binding = 1) uniform unfp sampler3D k_blob_3d;
 layout (binding = 2, imfmtc1) writeonly uniform unfp image3D qkcross_blob_3d;
 #else
 // Buffer path: q and k are arrays of sfpvec4, the output is an array of sfp
 // (type macros defined elsewhere).
 layout (binding = 0) readonly buffer q_blob { sfpvec4 q_blob_data[]; };
 layout (binding = 1) readonly buffer k_blob { sfpvec4 k_blob_data[]; };
 layout (binding = 2) writeonly buffer qkcross_blob { sfp qkcross_blob_data[]; };
 #endif

 // Push-constant copies of the four sizes, using the same names as the
 // specialization constants above.
 layout (push_constant) uniform parameter
 {
    int M;
    int N;
    int K;
    int B;
 } p;

 // One invocation produces one scalar of the q*k product: the sum, over the
 // reduction loop, of dot(q, k) for one vec4 entry of q and one of k per step.
 void main()
 {
    const int col = int(gl_GlobalInvocationID.x);
    const int row = int(gl_GlobalInvocationID.y);
    const int batch = int(gl_GlobalInvocationID.z);

    // Invocations outside the N x M x B output have nothing to compute.
    if (col >= psc(N) || row >= psc(M) || batch >= psc(B))
        return;

    afp acc = afp(0.f);

 #if NCNN_image_shader
    // y coordinate of reduction step 0 for this batch in both input images.
    const int step0_y = batch * psc(K);
 #else
    // Flat offsets of reduction step 0 in the q and k buffers.
    const int q_base = batch * psc(M) * psc(K) + row;
    const int k_base = batch * psc(N) * psc(K) + col;
 #endif

    for (int kk = 0; kk < psc(K); kk++)
    {
 #if NCNN_image_shader
        afpvec4 qv = image3d_ld4(q_blob_3d, ivec3(row, step0_y + kk, 0));

        afpvec4 kv = image3d_ld4(k_blob_3d, ivec3(col, step0_y + kk, 0));
 #else
        // Consecutive reduction steps are M (q) and N (k) entries apart.
        const int q_idx = q_base + kk * psc(M);
        afpvec4 qv = buffer_ld4(q_blob_data, q_idx);

        const int k_idx = k_base + kk * psc(N);
        afpvec4 kv = buffer_ld4(k_blob_data, k_idx);
 #endif

        acc += dot(qv, kv);
    }

 #if NCNN_image_shader
    image3d_st1(qkcross_blob_3d, ivec3(col, batch * psc(M) + row, 0), acc);
 #else
    // Output is laid out batch-major, then row, then col.
    const int out_idx = (batch * psc(M) + row) * psc(N) + col;
    buffer_st1(qkcross_blob_data, out_idx, acc);
 #endif
 }
--- a/src/layer/vulkan/shader/multiheadattention_qkv_cross.comp
+++ b/src/layer/vulkan/shader/multiheadattention_qkv_cross.comp
@@ -0,0 +1,81 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Problem sizes as specialization constants, all defaulting to 0.
 // In main(): N bounds the x invocation index, M bounds y, B bounds z and
 // K is the trip count of the reduction loop.
 // NOTE(review): psc() is defined outside this file; presumably it chooses
 // between these constants and the push-constant copies in `p` - confirm.
 layout (constant_id = 0) const int M = 0;
 layout (constant_id = 1) const int N = 0;
 layout (constant_id = 2) const int K = 0;
 layout (constant_id = 3) const int B = 0;

 #if NCNN_image_shader
 // Image path: the qk product and v are read through 3D samplers, the result
 // is written to a write-only 3D image.
 layout (binding = 0) uniform unfp sampler3D qkcross_blob_3d;
 layout (binding = 1) uniform unfp sampler3D v_blob_3d;
 layout (binding = 2, imfmtc1) writeonly uniform unfp image3D qkvcross_blob_3d;
 #else
 // Buffer path: qk and v are read-only storage buffers, the result buffer is
 // write-only. All three are arrays of sfp (type macro defined elsewhere).
 layout (binding = 0) readonly buffer qkcross_blob { sfp qkcross_blob_data[]; };
 layout (binding = 1) readonly buffer v_blob { sfp v_blob_data[]; };
 layout (binding = 2) writeonly buffer qkvcross_blob { sfp qkvcross_blob_data[]; };
 #endif

 // Push-constant copies of the four sizes, using the same names as the
 // specialization constants above.
 layout (push_constant) uniform parameter
 {
    int M;
    int N;
    int K;
    int B;
 } p;

 // One invocation produces one element of the (qk)*v product:
 //   out[batch][col][row] = sum over kk of qk[batch][row][kk] * v[batch][col][kk]
 // where col/row/batch are the x/y/z global invocation indices. Note that the
 // result is stored with col as the slower-varying index.
 void main()
 {
    const int col = int(gl_GlobalInvocationID.x);
    const int row = int(gl_GlobalInvocationID.y);
    const int batch = int(gl_GlobalInvocationID.z);

    // Invocations outside the N x M x B output have nothing to compute.
    if (col >= psc(N) || row >= psc(M) || batch >= psc(B))
        return;

    afp acc = afp(0.f);

 #if NCNN_image_shader
    // Image rows holding this invocation's qk row and v row.
    const int qk_y = batch * psc(M) + row;
    const int v_y = batch * psc(N) + col;
 #else
    // Flat offsets of the first element of the qk row and the v row.
    const int qk_base = (batch * psc(M) + row) * psc(K);
    const int v_base = (batch * psc(N) + col) * psc(K);
 #endif

    for (int kk = 0; kk < psc(K); kk++)
    {
 #if NCNN_image_shader
        afp qkv = image3d_ld1(qkcross_blob_3d, ivec3(kk, qk_y, 0));
        afp vv = image3d_ld1(v_blob_3d, ivec3(kk, v_y, 0));
 #else
        const int qk_idx = qk_base + kk;
        afp qkv = buffer_ld1(qkcross_blob_data, qk_idx);

        const int v_idx = v_base + kk;
        afp vv = buffer_ld1(v_blob_data, v_idx);
 #endif

        acc += qkv * vv;
    }

 #if NCNN_image_shader
    image3d_st1(qkvcross_blob_3d, ivec3(row, batch * psc(N) + col, 0), acc);
 #else
    const int out_idx = batch * psc(M) * psc(N) + col * psc(M) + row;

    buffer_st1(qkvcross_blob_data, out_idx, acc);
 #endif
 }
--- a/src/layer/vulkan/shader/multiheadattention_qkv_cross_pack1to4.comp
+++ b/src/layer/vulkan/shader/multiheadattention_qkv_cross_pack1to4.comp
@@ -0,0 +1,81 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Problem sizes as specialization constants, all defaulting to 0.
 // In main(): N bounds the x invocation index, M bounds y, B bounds z and
 // K is the trip count of the reduction loop.
 // NOTE(review): psc() is defined outside this file; presumably it chooses
 // between these constants and the push-constant copies in `p` - confirm.
 layout (constant_id = 0) const int M = 0;
 layout (constant_id = 1) const int N = 0;
 layout (constant_id = 2) const int K = 0;
 layout (constant_id = 3) const int B = 0;

 #if NCNN_image_shader
 // Image path: scalar loads from qk, four-component loads from v and a
 // four-component store to the output.
 layout (binding = 0) uniform unfp sampler3D qkcross_blob_3d;
 layout (binding = 1) uniform unfp sampler3D v_blob_3d;
 layout (binding = 2, imfmtc4) writeonly uniform unfp image3D qkvcross_blob_3d;
 #else
 // Buffer path: qk is an array of sfp, v and the output are arrays of sfpvec4
 // (type macros defined elsewhere).
 layout (binding = 0) readonly buffer qkcross_blob { sfp qkcross_blob_data[]; };
 layout (binding = 1) readonly buffer v_blob { sfpvec4 v_blob_data[]; };
 layout (binding = 2) writeonly buffer qkvcross_blob { sfpvec4 qkvcross_blob_data[]; };
 #endif

 // Push-constant copies of the four sizes, using the same names as the
 // specialization constants above.
 layout (push_constant) uniform parameter
 {
    int M;
    int N;
    int K;
    int B;
 } p;

 // One invocation produces one vec4 of the (qk)*v product: the sum, over the
 // reduction loop, of a scalar qk value times a vec4 entry of v.
 void main()
 {
    const int col = int(gl_GlobalInvocationID.x);
    const int row = int(gl_GlobalInvocationID.y);
    const int batch = int(gl_GlobalInvocationID.z);

    // Invocations outside the N x M x B output have nothing to compute.
    if (col >= psc(N) || row >= psc(M) || batch >= psc(B))
        return;

    afpvec4 acc = afpvec4(0.f);

 #if NCNN_image_shader
    // Image rows holding this invocation's qk row and v row.
    const int qk_y = batch * psc(M) + row;
    const int v_y = batch * psc(N) + col;
 #else
    // Flat offsets of the first entry of the qk row and the v row.
    const int qk_base = (batch * psc(M) + row) * psc(K);
    const int v_base = (batch * psc(N) + col) * psc(K);
 #endif

    for (int kk = 0; kk < psc(K); kk++)
    {
 #if NCNN_image_shader
        afp qkv = image3d_ld1(qkcross_blob_3d, ivec3(kk, qk_y, 0));
        afpvec4 vv = image3d_ld4(v_blob_3d, ivec3(kk, v_y, 0));
 #else
        const int qk_idx = qk_base + kk;
        afp qkv = buffer_ld1(qkcross_blob_data, qk_idx);

        const int v_idx = v_base + kk;
        afpvec4 vv = buffer_ld4(v_blob_data, v_idx);
 #endif

        acc += qkv * vv;
    }

 #if NCNN_image_shader
    image3d_st4(qkvcross_blob_3d, ivec3(row, batch * psc(N) + col, 0), acc);
 #else
    const int out_idx = batch * psc(M) * psc(N) + col * psc(M) + row;

    buffer_st4(qkvcross_blob_data, out_idx, acc);
 #endif
 }
--- a/src/layer/vulkan/shader/multiheadattention_qkv_cross_pack4.comp
+++ b/src/layer/vulkan/shader/multiheadattention_qkv_cross_pack4.comp
@@ -0,0 +1,177 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Number of reduction steps staged in shared memory per iteration of the
 // tiled loop in main().
 #define LOCAL_MEMORY_UNROLL_INCH 8

 // Problem sizes as specialization constants, all defaulting to 0.
 // In main(): N bounds the x invocation index, M bounds y, B bounds z and
 // K is the trip count of the reduction loop.
 // NOTE(review): psc() is defined outside this file; presumably it chooses
 // between these constants and the push-constant copies in `p` - confirm.
 layout (constant_id = 0) const int M = 0;
 layout (constant_id = 1) const int N = 0;
 layout (constant_id = 2) const int K = 0;
 layout (constant_id = 3) const int B = 0;

 #if NCNN_image_shader
 // Image path: four-component loads from qk and v, four-component stores.
 layout (binding = 0) uniform unfp sampler3D qkcross_blob_3d;
 layout (binding = 1) uniform unfp sampler3D v_blob_3d;
 layout (binding = 2, imfmtc4) writeonly uniform unfp image3D qkvcross_blob_3d;
 #else
 // Buffer path: all three blobs are arrays of sfpvec4 (type macro defined
 // elsewhere).
 layout (binding = 0) readonly buffer qkcross_blob { sfpvec4 qkcross_blob_data[]; };
 layout (binding = 1) readonly buffer v_blob { sfpvec4 v_blob_data[]; };
 layout (binding = 2) writeonly buffer qkvcross_blob { sfpvec4 qkvcross_blob_data[]; };
 #endif

 // Push-constant copies of the four sizes, using the same names as the
 // specialization constants above.
 layout (push_constant) uniform parameter
 {
    int M;
    int N;
    int K;
    int B;
 } p;

 #if NCNN_shader_local_memory
 // Shared staging tiles used by the tiled loop in main().
 // tmp_qk is indexed [local y][reduction step],
 // tmp_v is indexed [local x][reduction step].
 // NOTE(review): the first dimension of 8 assumes local x/y ids stay below 8
 // - confirm against the workgroup size chosen by the host code.
 shared lfpvec4 tmp_qk[8][LOCAL_MEMORY_UNROLL_INCH];
 shared lfpvec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH];
 #endif

 // One invocation produces four vec4 results of the (qk)*v product. At each
 // reduction step one vec4 of qk and one vec4 of v are combined: sum_i
 // accumulates (lane i of qk) * v, for i = 0..3. The four sums are stored in
 // four consecutive output entries.
 //
 // Two implementations are compiled depending on the build macros:
 //  - buffer storage with local memory: the workgroup cooperatively stages
 //    LOCAL_MEMORY_UNROLL_INCH reduction steps of qk and v in shared memory;
 //  - everything else: each invocation loads its own operands directly.
 void main()
 {
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // Only the shared-memory path contains barrier() and therefore needs
    // every invocation of the workgroup to stay alive. That path is compiled
    // solely for buffer storage with local memory enabled, so in every other
    // configuration (including image storage with local memory enabled)
    // out-of-range invocations leave right away instead of running the whole
    // reduction on coordinates outside the blobs.
 #if NCNN_image_shader || !NCNN_shader_local_memory
    if (gx >= psc(N) || gy >= psc(M) || gz >= psc(B))
        return;
 #endif

    afpvec4 sum0 = afpvec4(0.f);
    afpvec4 sum1 = afpvec4(0.f);
    afpvec4 sum2 = afpvec4(0.f);
    afpvec4 sum3 = afpvec4(0.f);

 #if !NCNN_image_shader && NCNN_shader_local_memory
    const int NN = psc(K);

    const int lx = int(gl_LocalInvocationID.x);
    const int ly = int(gl_LocalInvocationID.y);

    // For staging, the local x id selects the reduction step of the qk entry
    // this invocation loads, and the local y id selects the step for v.
    int ai = gz * psc(M) * psc(K) + gy * psc(K) + lx;
    int bi = gz * psc(N) * psc(K) + gx * psc(K) + ly;

    int k = 0;
    for (; k + (LOCAL_MEMORY_UNROLL_INCH - 1) < NN; k += LOCAL_MEMORY_UNROLL_INCH)
    {
        {
            tmp_qk[ly][lx] = sfp2lfpvec4(qkcross_blob_data[ai]);
        }

        {
            tmp_v[lx][ly] = sfp2lfpvec4(v_blob_data[bi]);
        }

        // Make the freshly staged tiles visible to the whole workgroup.
        barrier();

        for (int k4 = 0; k4 < LOCAL_MEMORY_UNROLL_INCH; k4++)
        {
            afpvec4 qk0 = lfp2afpvec4(tmp_qk[ly][k4]);

            afpvec4 v0 = lfp2afpvec4(tmp_v[lx][k4]);

            sum0 += qk0.r * v0;
            sum1 += qk0.g * v0;
            sum2 += qk0.b * v0;
            sum3 += qk0.a * v0;
        }

        ai += LOCAL_MEMORY_UNROLL_INCH;
        bi += LOCAL_MEMORY_UNROLL_INCH;

        // Nobody may overwrite the tiles while others are still reading them.
        barrier();
    }

    if (k < NN)
    {
        // Fewer than LOCAL_MEMORY_UNROLL_INCH steps are left: only the
        // invocations whose staging slot maps to a remaining step load.
        const int remain = NN - k;

        if (lx < remain)
        {
            tmp_qk[ly][lx] = sfp2lfpvec4(qkcross_blob_data[ai]);
        }

        if (ly < remain)
        {
            tmp_v[lx][ly] = sfp2lfpvec4(v_blob_data[bi]);
        }

        barrier();

        for (int k4 = 0; k4 < remain; k4++)
        {
            afpvec4 qk0 = lfp2afpvec4(tmp_qk[ly][k4]);

            afpvec4 v0 = lfp2afpvec4(tmp_v[lx][k4]);

            sum0 += qk0.r * v0;
            sum1 += qk0.g * v0;
            sum2 += qk0.b * v0;
            sum3 += qk0.a * v0;
        }
    }

    // All barriers are behind us; out-of-range invocations only took part to
    // fill the shared tiles and must not store a result.
    if (gx >= psc(N) || gy >= psc(M) || gz >= psc(B))
        return;
 #else
    for (int k = 0; k < psc(K); k++)
    {
 #if NCNN_image_shader
        afpvec4 qk0 = image3d_ld4(qkcross_blob_3d, ivec3(k, gz * psc(M) + gy, 0));
        afpvec4 v0 = image3d_ld4(v_blob_3d, ivec3(k, gz * psc(N) + gx, 0));
 #else
        const int ai = gz * psc(M) * psc(K) + gy * psc(K) + k;
        afpvec4 qk0 = buffer_ld4(qkcross_blob_data, ai);

        const int bi = gz * psc(N) * psc(K) + gx * psc(K) + k;
        afpvec4 v0 = buffer_ld4(v_blob_data, bi);
 #endif

        sum0 += qk0.r * v0;
        sum1 += qk0.g * v0;
        sum2 += qk0.b * v0;
        sum3 += qk0.a * v0;
    }
 #endif

 #if NCNN_image_shader
    image3d_st4(qkvcross_blob_3d, ivec3(gy * 4, gz * psc(N) + gx, 0), sum0);
    image3d_st4(qkvcross_blob_3d, ivec3(gy * 4 + 1, gz * psc(N) + gx, 0), sum1);
    image3d_st4(qkvcross_blob_3d, ivec3(gy * 4 + 2, gz * psc(N) + gx, 0), sum2);
    image3d_st4(qkvcross_blob_3d, ivec3(gy * 4 + 3, gz * psc(N) + gx, 0), sum3);
 #else
    const int gi = (gz * psc(M) * psc(N) + gx * psc(M) + gy) * 4;

    buffer_st4(qkvcross_blob_data, gi, sum0);
    buffer_st4(qkvcross_blob_data, gi + 1, sum1);
    buffer_st4(qkvcross_blob_data, gi + 2, sum2);
    buffer_st4(qkvcross_blob_data, gi + 3, sum3);
 #endif
 }
--- a/src/layer/vulkan/shader/multiheadattention_qkv_cross_pack4to1.comp
+++ b/src/layer/vulkan/shader/multiheadattention_qkv_cross_pack4to1.comp
@@ -0,0 +1,87 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Problem sizes as specialization constants, all defaulting to 0.
 // In main(): N bounds the x invocation index, M bounds y (each y index yields
 // four output values), B bounds z and K is the trip count of the reduction
 // loop.
 // NOTE(review): psc() is defined outside this file; presumably it chooses
 // between these constants and the push-constant copies in `p` - confirm.
 layout (constant_id = 0) const int M = 0;
 layout (constant_id = 1) const int N = 0;
 layout (constant_id = 2) const int K = 0;
 layout (constant_id = 3) const int B = 0;

 #if NCNN_image_shader
 // Image path: four-component loads from qk, scalar loads from v and scalar
 // stores to the output.
 layout (binding = 0) uniform unfp sampler3D qkcross_blob_3d;
 layout (binding = 1) uniform unfp sampler3D v_blob_3d;
 layout (binding = 2, imfmtc1) writeonly uniform unfp image3D qkvcross_blob_3d;
 #else
 // Buffer path: qk is an array of sfpvec4, v and the output are arrays of sfp
 // (type macros defined elsewhere).
 layout (binding = 0) readonly buffer qkcross_blob { sfpvec4 qkcross_blob_data[]; };
 layout (binding = 1) readonly buffer v_blob { sfp v_blob_data[]; };
 layout (binding = 2) writeonly buffer qkvcross_blob { sfp qkvcross_blob_data[]; };
 #endif

 // Push-constant copies of the four sizes, using the same names as the
 // specialization constants above.
 layout (push_constant) uniform parameter
 {
    int M;
    int N;
    int K;
    int B;
 } p;

 // One invocation produces four scalar results of the (qk)*v product: the
 // sum, over the reduction loop, of a vec4 entry of qk times a scalar v value.
 // The four lanes are stored to four consecutive output positions.
 void main()
 {
    const int col = int(gl_GlobalInvocationID.x);
    const int row4 = int(gl_GlobalInvocationID.y);
    const int batch = int(gl_GlobalInvocationID.z);

    // Invocations outside the N x M x B output have nothing to compute.
    if (col >= psc(N) || row4 >= psc(M) || batch >= psc(B))
        return;

    afpvec4 acc = afpvec4(0.f);

 #if NCNN_image_shader
    // Image rows holding this invocation's qk row and v row.
    const int qk_y = batch * psc(M) + row4;
    const int v_y = batch * psc(N) + col;
 #else
    // Flat offsets of the first entry of the qk row and the v row.
    const int qk_base = (batch * psc(M) + row4) * psc(K);
    const int v_base = (batch * psc(N) + col) * psc(K);
 #endif

    for (int kk = 0; kk < psc(K); kk++)
    {
 #if NCNN_image_shader
        afpvec4 qkv = image3d_ld4(qkcross_blob_3d, ivec3(kk, qk_y, 0));
        afp vv = image3d_ld1(v_blob_3d, ivec3(kk, v_y, 0));
 #else
        const int qk_idx = qk_base + kk;
        afpvec4 qkv = buffer_ld4(qkcross_blob_data, qk_idx);

        const int v_idx = v_base + kk;
        afp vv = buffer_ld1(v_blob_data, v_idx);
 #endif

        acc += qkv * vv;
    }

 #if NCNN_image_shader
    // The four lanes go to four neighbouring texels along x.
    const int out_x = row4 * 4;
    const int out_y = batch * psc(N) + col;
    image3d_st1(qkvcross_blob_3d, ivec3(out_x, out_y, 0), acc.r);
    image3d_st1(qkvcross_blob_3d, ivec3(out_x + 1, out_y, 0), acc.g);
    image3d_st1(qkvcross_blob_3d, ivec3(out_x + 2, out_y, 0), acc.b);
    image3d_st1(qkvcross_blob_3d, ivec3(out_x + 3, out_y, 0), acc.a);
 #else
    // The four lanes go to four consecutive scalars of the output buffer.
    const int out_idx = (batch * psc(M) * psc(N) + col * psc(M) + row4) * 4;

    buffer_st1(qkvcross_blob_data, out_idx, acc.r);
    buffer_st1(qkvcross_blob_data, out_idx + 1, acc.g);
    buffer_st1(qkvcross_blob_data, out_idx + 2, acc.b);
    buffer_st1(qkvcross_blob_data, out_idx + 3, acc.a);
 #endif
 }
--- a/src/layer/x86/multiheadattention_x86.cpp
+++ b/src/layer/x86/multiheadattention_x86.cpp
@@ -39,7 +39,7 @@ MultiHeadAttention_x86::MultiHeadAttention_x86()
 int MultiHeadAttention_x86::create_pipeline(const Option& opt)
 {
    {
        const int embed_dim_per_head = embed_dim / num_head;
        const int embed_dim_per_head = embed_dim / num_heads;
        const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head);

        q_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
@@ -271,7 +271,7 @@ int MultiHeadAttention_x86::forward(const std::vector<Mat>& bottom_blobs, std::v
    const Mat& k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1];
    const Mat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs.size() == 2 ? k_blob : bottom_blobs[2];

    const int embed_dim_per_head = embed_dim / num_head;
    const int embed_dim_per_head = embed_dim / num_heads;
    const int src_seqlen = q_blob.h * q_blob.elempack;
    const int dst_seqlen = k_blob.h * k_blob.elempack;

@@ -281,9 +281,9 @@ int MultiHeadAttention_x86::forward(const std::vector<Mat>& bottom_blobs, std::v
    Mat k_affine;
    k_gemm->forward(k_blob, k_affine, opt);

    Mat qk_cross(dst_seqlen, src_seqlen * num_head, 4u, opt.blob_allocator);
    Mat qk_cross(dst_seqlen, src_seqlen * num_heads, 4u, opt.blob_allocator);
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int i = 0; i < num_head; i++)
    for (int i = 0; i < num_heads; i++)
    {
        std::vector<Mat> qk_bottom_blobs(2);
        qk_bottom_blobs[0] = q_affine.row_range(i * embed_dim_per_head, embed_dim_per_head);
@@ -303,9 +303,9 @@ int MultiHeadAttention_x86::forward(const std::vector<Mat>& bottom_blobs, std::v
    Mat v_affine;
    v_gemm->forward(v_blob, v_affine, opt);

    Mat qkv_cross(src_seqlen, embed_dim_per_head * num_head, 4u, opt.blob_allocator);
    Mat qkv_cross(src_seqlen, embed_dim_per_head * num_heads, 4u, opt.blob_allocator);
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int i = 0; i < num_head; i++)
    for (int i = 0; i < num_heads; i++)
    {
        std::vector<Mat> qkv_bottom_blobs(2);
        qkv_bottom_blobs[0] = qk_cross.row_range(i * src_seqlen, src_seqlen);
--- a/tests/test_multiheadattention.cpp
+++ b/tests/test_multiheadattention.cpp
@@ -46,7 +46,7 @@ static int test_multiheadattention(const ncnn::Mat& q, const ncnn::Mat& k, const
    int ret = test_layer<ncnn::MultiHeadAttention>("MultiHeadAttention", pd, weights, as, 1, epsilon);
    if (ret != 0)
    {
        fprintf(stderr, "test_multiheadattention failed q=(%d %d) k=(%d %d) v=(%d %d)\n", q.w, q.h, k.w, k.h, v.w, v.h);
        fprintf(stderr, "test_multiheadattention failed q=(%d %d) k=(%d %d) v=(%d %d) num_heads=%d kdim=%d vdim=%d\n", q.w, q.h, k.w, k.h, v.w, v.h, num_heads, kdim, vdim);
    }

    return ret;
@@ -82,7 +82,7 @@ static int test_multiheadattention_samekv(const ncnn::Mat& q, const ncnn::Mat& k
    int ret = test_layer<ncnn::MultiHeadAttention>("MultiHeadAttention", pd, weights, as, 1, epsilon);
    if (ret != 0)
    {
        fprintf(stderr, "test_multiheadattention_samekv failed q=(%d %d) kv=(%d %d)\n", q.w, q.h, kv.w, kv.h);
        fprintf(stderr, "test_multiheadattention_samekv failed q=(%d %d) kv=(%d %d) num_heads=%d kvdim=%d\n", q.w, q.h, kv.w, kv.h, num_heads, kvdim);
    }

    return ret;
@@ -115,7 +115,7 @@ static int test_multiheadattention_sameqkv(const ncnn::Mat& a, int num_heads)
    int ret = test_layer<ncnn::MultiHeadAttention>("MultiHeadAttention", pd, weights, as, 1, epsilon);
    if (ret != 0)
    {
        fprintf(stderr, "test_multiheadattention_sameqkv failed a=(%d %d)\n", a.w, a.h);
        fprintf(stderr, "test_multiheadattention_sameqkv failed a=(%d %d) num_heads=%d\n", a.w, a.h, num_heads);
    }

    return ret;
@@ -124,6 +124,8 @@ static int test_multiheadattention_sameqkv(const ncnn::Mat& a, int num_heads)
 static int test_multiheadattention_0()
 {
    return 0
           || test_multiheadattention(RandomMat(62, 66), RandomMat(32, 66), RandomMat(20, 66), 2, 32, 20)
           || test_multiheadattention(RandomMat(26, 64), RandomMat(32, 64), RandomMat(18, 64), 2, 32, 18)
           || test_multiheadattention(RandomMat(64, 128), RandomMat(64, 128), RandomMat(64, 128), 4, 64, 64)
           || test_multiheadattention(RandomMat(64, 127), RandomMat(64, 127), RandomMat(64, 127), 16, 64, 64)
           || test_multiheadattention(RandomMat(16, 128), RandomMat(44, 128), RandomMat(55, 128), 2, 44, 55)
@@ -146,8 +148,8 @@ static int test_multiheadattention_1()
 static int test_multiheadattention_2()
 {
    return 0
           || test_multiheadattention_sameqkv(RandomMat(64, 128), 8)
           || test_multiheadattention_sameqkv(RandomMat(64, 127), 32);
           || test_multiheadattention_sameqkv(RandomMat(64, 128), 4)
           || test_multiheadattention_sameqkv(RandomMat(64, 127), 8);
 }

 int main()
--- a/tools/modelwriter.h
+++ b/tools/modelwriter.h
@@ -1922,7 +1922,7 @@ int ModelWriter::save(const char* parampath, const char* binpath)
            ncnn::MultiHeadAttention* op_default = (ncnn::MultiHeadAttention*)layer_default;

            fprintf_param_value(" 0=%d", embed_dim)
            fprintf_param_value(" 1=%d", num_head)
            fprintf_param_value(" 1=%d", num_heads)
            fprintf_param_value(" 2=%d", weight_data_size)
            fprintf_param_value(" 3=%d", kdim)
            fprintf_param_value(" 4=%d", vdim)