diff --git a/.ci/test-coverage.yml b/.ci/test-coverage.yml
index 7ccc10656..e0800bb6b 100644
--- a/.ci/test-coverage.yml
+++ b/.ci/test-coverage.yml
@@ -52,14 +52,14 @@ jobs:
       uses: cache@1.*
       with:
         cachePaths: swiftshader-install
-        cacheKey: swiftshader-linux-install-20230420-1
+        cacheKey: swiftshader-linux-install-20240622
 
     - name: checkout-swiftshader
       if: steps.cache-swiftshader.outputs.cacheHit != 'true'
       checkout: https://github.com/google/swiftshader.git
       with:
         pullType: COMMIT_ID
-        refName: dd55e592406dc0bae219df11adec6363840aff4a
+        refName: de870ac7518fe2b6bb651ecc22fc36647cf7b986
         localPath: swiftshader
         enableSubmodule: false
         enableGitLfs: false
diff --git a/.github/workflows/linux-x64-gpu-clang.yml b/.github/workflows/linux-x64-gpu-clang.yml
index 8ab7e6ae9..0d0ba34f2 100644
--- a/.github/workflows/linux-x64-gpu-clang.yml
+++ b/.github/workflows/linux-x64-gpu-clang.yml
@@ -46,14 +46,14 @@ jobs:
       uses: actions/cache@v4
       with:
         path: swiftshader-install
-        key: swiftshader-linux-install-20230420
+        key: swiftshader-linux-install-20240622
     - name: checkout-swiftshader
       if: steps.cache-swiftshader.outputs.cache-hit != 'true'
       uses: actions/checkout@v4
       with:
         repository: google/swiftshader
         path: swiftshader
-        ref: dd55e592406dc0bae219df11adec6363840aff4a
+        ref: de870ac7518fe2b6bb651ecc22fc36647cf7b986  # pinned SwiftShader commit; bump the cache key as well, since a cache hit skips this checkout
     - name: checkout-swiftshader-submodules
       if: steps.cache-swiftshader.outputs.cache-hit != 'true'
       run: |
diff --git a/.github/workflows/linux-x64-gpu-gcc.yml b/.github/workflows/linux-x64-gpu-gcc.yml
index 55eb9ff87..27052a9e4 100644
--- a/.github/workflows/linux-x64-gpu-gcc.yml
+++ b/.github/workflows/linux-x64-gpu-gcc.yml
@@ -46,14 +46,14 @@ jobs:
       uses: actions/cache@v4
       with:
         path: swiftshader-install
-        key: swiftshader-linux-install-20230420
+        key: swiftshader-linux-install-20240622
     - name: checkout-swiftshader
       if: steps.cache-swiftshader.outputs.cache-hit != 'true'
       uses: actions/checkout@v4
       with:
         repository: google/swiftshader
         path: swiftshader
-        ref: dd55e592406dc0bae219df11adec6363840aff4a
+        ref: de870ac7518fe2b6bb651ecc22fc36647cf7b986  # pinned SwiftShader commit; bump the cache key as well, since a cache hit skips this checkout
     - name: checkout-swiftshader-submodules
       if: steps.cache-swiftshader.outputs.cache-hit != 'true'
       run: |
diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
index e4df7880a..160fe0faf 100644
--- a/.github/workflows/macos.yml
+++ b/.github/workflows/macos.yml
@@ -137,14 +137,14 @@ jobs:
       uses: actions/cache@v4
       with:
         path: swiftshader-install
-        key: swiftshader-macos-install-20230420
+        key: swiftshader-macos-install-20240622
     - name: checkout-swiftshader
       if: steps.cache-swiftshader.outputs.cache-hit != 'true'
       uses: actions/checkout@v4
       with:
         repository: google/swiftshader
         path: swiftshader
-        ref: dd55e592406dc0bae219df11adec6363840aff4a
+        ref: de870ac7518fe2b6bb651ecc22fc36647cf7b986  # pinned SwiftShader commit; bump the cache key as well, since a cache hit skips this checkout
     - name: checkout-swiftshader-submodules
       if: steps.cache-swiftshader.outputs.cache-hit != 'true'
       run: |
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index e8c5ee384..ff73510d2 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -56,14 +56,14 @@ jobs:
       uses: actions/cache@v4
       with:
         path: swiftshader-install
-        key: swiftshader-linux-install-20230420
+        key: swiftshader-linux-install-20240622
     - name: checkout-swiftshader
       if: matrix.os == 'ubuntu-latest' && steps.cache-swiftshader.outputs.cache-hit != 'true'
       uses: actions/checkout@v4
       with:
         repository: google/swiftshader
         path: swiftshader
-        ref: dd55e592406dc0bae219df11adec6363840aff4a
+        ref: de870ac7518fe2b6bb651ecc22fc36647cf7b986  # pinned SwiftShader commit; bump the cache key as well, since a cache hit skips this checkout
     - name: checkout-swiftshader-submodules
       if: matrix.os == 'ubuntu-latest' && steps.cache-swiftshader.outputs.cache-hit != 'true'
       run: |
diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml
index 3695bbed2..1384cc43d 100644
--- a/.github/workflows/test-coverage.yml
+++ b/.github/workflows/test-coverage.yml
@@ -52,11 +52,62 @@ jobs:
         lcov -r lcov.info '*/install/*' -o lcov.info
         lcov -r lcov.info '*/build/*' -o lcov.info
         lcov --list lcov.info
+
     - name: codecov
+      id: codecov  # referenced by the first retry step's condition and by the final status step
+      continue-on-error: true  # an upload failure must not fail the job here; the final status step decides
+      uses: codecov/codecov-action@v3
+      with:
+        token: ${{ secrets.CODECOV_TOKEN }}
+        file: build/lcov.info
+    - name: codecov-vlen256-retry-1  # NOTE(review): "vlen256" in these names/ids looks copied from another job - confirm
+      continue-on-error: true
+      id: codecov-vlen256-retry-1
+      if: steps.codecov.outcome=='failure'  # retry only when the previous upload attempt failed
+      uses: codecov/codecov-action@v3
+      with:
+        token: ${{ secrets.CODECOV_TOKEN }}
+        file: build/lcov.info
+    - name: codecov-vlen256-retry-2
+      continue-on-error: true
+      id: codecov-vlen256-retry-2
+      if: steps.codecov-vlen256-retry-1.outcome=='failure'
+      uses: codecov/codecov-action@v3
+      with:
+        token: ${{ secrets.CODECOV_TOKEN }}
+        file: build/lcov.info
+    - name: codecov-vlen256-retry-3
+      continue-on-error: true
+      id: codecov-vlen256-retry-3
+      if: steps.codecov-vlen256-retry-2.outcome=='failure'
+      uses: codecov/codecov-action@v3
+      with:
+        token: ${{ secrets.CODECOV_TOKEN }}
+        file: build/lcov.info
+    - name: codecov-vlen256-retry-4
+      continue-on-error: true
+      id: codecov-vlen256-retry-4
+      if: steps.codecov-vlen256-retry-3.outcome=='failure'
       uses: codecov/codecov-action@v3
       with:
         token: ${{ secrets.CODECOV_TOKEN }}
         file: build/lcov.info
+    - name: codecov-vlen256-retry-5  # last attempt in the retry chain
+      continue-on-error: true
+      id: codecov-vlen256-retry-5
+      if: steps.codecov-vlen256-retry-4.outcome=='failure'
+      uses: codecov/codecov-action@v3
+      with:
+        token: ${{ secrets.CODECOV_TOKEN }}
+        file: build/lcov.info
+    - name: set the status  # fails the job unless the upload or one of its retries succeeded
+      if: always()  # evaluate the final result regardless of how earlier steps ended
+      run: |
+        if ${{ steps.codecov.outcome=='success' || steps.codecov-vlen256-retry-1.outcome=='success' || steps.codecov-vlen256-retry-2.outcome=='success' || steps.codecov-vlen256-retry-3.outcome=='success' || steps.codecov-vlen256-retry-4.outcome=='success' || steps.codecov-vlen256-retry-5.outcome=='success' }}; then
+           echo fine
+        else
+           exit 1
+        fi
 
   linux-gcc-x64-avx512-spr:
     runs-on: ubuntu-22.04
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index b171e1c8c..d099290a3 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -79,26 +79,26 @@ jobs:
         cmake --build . --config Release -j 4
         cmake --build . --config Release --target install
     - name: cache-swiftshader
-      if: matrix.vs-version != 'vs2015'
+      if: matrix.vs-version != 'vs2015' && matrix.vs-version != 'vs2017'
       id: cache-swiftshader
       uses: actions/cache@v4
       with:
         path: swiftshader-install
-        key: swiftshader-${{ matrix.vs-version }}-x64-install-20230420
+        key: swiftshader-${{ matrix.vs-version }}-x64-install-20240622
     - name: checkout-swiftshader
-      if: matrix.vs-version != 'vs2015' && steps.cache-swiftshader.outputs.cache-hit != 'true'
+      if: matrix.vs-version != 'vs2015' && matrix.vs-version != 'vs2017' && steps.cache-swiftshader.outputs.cache-hit != 'true'
       uses: actions/checkout@v4
       with:
         repository: google/swiftshader
         path: swiftshader
-        ref: dd55e592406dc0bae219df11adec6363840aff4a
+        ref: de870ac7518fe2b6bb651ecc22fc36647cf7b986  # pinned SwiftShader commit; bump the cache key as well, since a cache hit skips this checkout
     - name: checkout-swiftshader-submodules
-      if: matrix.vs-version != 'vs2015' && steps.cache-swiftshader.outputs.cache-hit != 'true'
+      if: matrix.vs-version != 'vs2015' && matrix.vs-version != 'vs2017' && steps.cache-swiftshader.outputs.cache-hit != 'true'
       run: |
         cd swiftshader
         git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive
     - name: swiftshader
-      if: matrix.vs-version != 'vs2015' && steps.cache-swiftshader.outputs.cache-hit != 'true'
+      if: matrix.vs-version != 'vs2015' && matrix.vs-version != 'vs2017' && steps.cache-swiftshader.outputs.cache-hit != 'true'
       run: |
         cd swiftshader
         mkdir build-${{ matrix.vs-version }}; cd build-${{ matrix.vs-version }}
@@ -113,7 +113,7 @@ jobs:
         cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON ..
         cmake --build . --config Release -j 4
     - name: x64-test
-      if: matrix.vs-version != 'vs2015'
+      if: matrix.vs-version != 'vs2015' && matrix.vs-version != 'vs2017'
       run: |
         echo "[Processor]`nThreadCount=1`n" > build-x64/tests/Release/SwiftShader.ini
         Copy-Item -Path "$env:GITHUB_WORKSPACE\swiftshader-install\vulkan-1.dll" -Destination 'build-x64\tests'
diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md
index 6056c277b..05996f8d7 100644
--- a/docs/developer-guide/operators.md
+++ b/docs/developer-guide/operators.md
@@ -1266,21 +1266,22 @@ y = affine(out)
 | --------- | ------------- | ----- | --------- | ----------------- |
 | 0         | embed_dim     | int   | 0         |                   |
 | 1         | num_heads     | int   | 1         |                   |
-| 2         | weight_data_size| int | 0         |                   |
+| 2         | weight_data_size| int | 0         | qdim = weight_data_size / embed_dim |
 | 3         | kdim          | int   | embed_dim |                   |
 | 4         | vdim          | int   | embed_dim |                   |
 | 5         | attn_mask     | int   | 0         |                   |
+| 6         | scale         | float | 1.f / sqrt(embed_dim / num_heads) | |
 
 | weight        | type  | shape                 |
 | ------------- | ----- | --------------------- |
-| q_weight_data | float/fp16/int8 | [weight_data_size] |
+| q_weight_data | float/fp16/int8 | [embed_dim * qdim] |
 | q_bias_data   | float | [embed_dim]           |
 | k_weight_data | float/fp16/int8 | [embed_dim * kdim] |
 | k_bias_data   | float | [embed_dim]           |
 | v_weight_data | float/fp16/int8 | [embed_dim * vdim] |
 | v_bias_data   | float | [embed_dim]           |
-| out_weight_data| float/fp16/int8 | [weight_data_size] |
-| out_bias_data | float | [embed_dim]           |
+| out_weight_data| float/fp16/int8 | [qdim * embed_dim] |
+| out_bias_data | float | [qdim]                |
 
 # MVN
 ```
diff --git a/src/layer/arm/multiheadattention_arm.cpp b/src/layer/arm/multiheadattention_arm.cpp
index f5826ddae..9fedf8b16 100644
--- a/src/layer/arm/multiheadattention_arm.cpp
+++ b/src/layer/arm/multiheadattention_arm.cpp
@@ -60,12 +60,9 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
     const int qdim = weight_data_size / embed_dim;
 
     {
-        const int embed_dim_per_head = embed_dim / num_heads;
-        const float inv_sqrt_embed_dim_per_head = 1.f / sqrtf(embed_dim_per_head);
-
         q_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
         ncnn::ParamDict pd;
-        pd.set(0, inv_sqrt_embed_dim_per_head);
+        pd.set(0, scale);
         pd.set(1, 1.f);
         pd.set(2, 0);         // transA
         pd.set(3, 1);         // transB
diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp
index 284801a2c..e25eec88a 100644
--- a/src/layer/multiheadattention.cpp
+++ b/src/layer/multiheadattention.cpp
@@ -30,6 +30,7 @@ int MultiHeadAttention::load_param(const ParamDict& pd)
     kdim = pd.get(3, embed_dim);
     vdim = pd.get(4, embed_dim);
     attn_mask = pd.get(5, 0);
+    scale = pd.get(6, 1.f / sqrtf(embed_dim / num_heads));
 
     return 0;
 }
@@ -111,12 +112,10 @@ int MultiHeadAttention::forward(const std::vector<Mat>& bottom_blobs, std::vecto
     if (xqkv.empty())
         return -100;
 
-    const float inv_sqrt_embed_dim_per_head = 1.f / sqrtf(embed_dim_per_head);
-
     #pragma omp parallel for num_threads(opt.num_threads)
     for (int q = 0; q < num_heads; q++)
     {
-        // xq = affine(q) * inv_sqrt_embed_dim_per_head
+        // xq = affine(q) * scale
         {
             Mat outm = xq.channel(q);
 
@@ -135,7 +134,7 @@ int MultiHeadAttention::forward(const std::vector<Mat>& bottom_blobs, std::vecto
                         sum += *ptr++ * *kptr++;
                     }
 
-                    outptr[j] = sum * inv_sqrt_embed_dim_per_head;
+                    outptr[j] = sum * scale;
                 }
             }
         }
diff --git a/src/layer/multiheadattention.h b/src/layer/multiheadattention.h
index 50c8549ac..55764bd9c 100644
--- a/src/layer/multiheadattention.h
+++ b/src/layer/multiheadattention.h
@@ -37,6 +37,7 @@ public:
     int kdim;
     int vdim;
     int attn_mask;
+    float scale;
 
     Mat q_weight_data;
     Mat q_bias_data;
diff --git a/src/layer/vulkan/multiheadattention_vulkan.cpp b/src/layer/vulkan/multiheadattention_vulkan.cpp
index f1d7ce363..1abc09c30 100644
--- a/src/layer/vulkan/multiheadattention_vulkan.cpp
+++ b/src/layer/vulkan/multiheadattention_vulkan.cpp
@@ -48,12 +48,10 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
     const int embed_dim_per_head = embed_dim / num_heads;
     const int qdim = weight_data_size / embed_dim;
     {
-        const float inv_sqrt_embed_dim_per_head = 1.f / sqrtf(embed_dim_per_head);
-
         q_gemm = ncnn::create_layer_vulkan(ncnn::LayerType::Gemm);
         q_gemm->vkdev = vkdev;
         ncnn::ParamDict pd;
-        pd.set(0, inv_sqrt_embed_dim_per_head);
+        pd.set(0, scale);
         pd.set(1, 1.f);
         pd.set(2, 0);         // transA
         pd.set(3, 1);         // transB
diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp
index db5f730ae..9bddb3a78 100644
--- a/src/layer/x86/multiheadattention_x86.cpp
+++ b/src/layer/x86/multiheadattention_x86.cpp
@@ -41,12 +41,9 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
     const int qdim = weight_data_size / embed_dim;
 
     {
-        const int embed_dim_per_head = embed_dim / num_heads;
-        const float inv_sqrt_embed_dim_per_head = 1.f / sqrtf(embed_dim_per_head);
-
         q_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
         ncnn::ParamDict pd;
-        pd.set(0, inv_sqrt_embed_dim_per_head);
+        pd.set(0, scale);
         pd.set(1, 1.f);
         pd.set(2, 0);         // transA
         pd.set(3, 1);         // transB
diff --git a/tests/test_multiheadattention.cpp b/tests/test_multiheadattention.cpp
index c509f8156..5f110284c 100644
--- a/tests/test_multiheadattention.cpp
+++ b/tests/test_multiheadattention.cpp
@@ -106,6 +106,7 @@ static int test_multiheadattention_sameqkv(const ncnn::Mat& a, int embed_dim, in
     pd.set(2, embed_dim * qdim);
     pd.set(3, qdim);
     pd.set(4, qdim);
+    pd.set(6, 0.7f / sqrtf(embed_dim / num_heads));
 
     std::vector<ncnn::Mat> weights(8);
     weights[0] = RandomMat(embed_dim * qdim);