update ncnnoptimize layers, lightmode=false keeps original weight (#5414)

2 years ago · db035d602d
--- a/docs/developer-guide/operators.md
+++ b/docs/developer-guide/operators.md
@@ -30,8 +30,10 @@
 * [Dropout](#dropout)
 * [Eltwise](#eltwise)
 * [ELU](#elu)
 * [Embed](#embed)
 * [Exp](#exp)
 * [Flatten](#flatten)
 * [Fold](#fold)
 * [GELU](#gelu)
 * [GLU](#glu)
 * [Gemm](#gemm)
@@ -84,6 +86,7 @@
 * [Threshold](#threshold)
 * [Tile](#tile)
 * [UnaryOp](#unaryop)
 * [Unfold](#unfold)

 # AbsVal
 ```
@@ -474,12 +477,15 @@ y = crop(x)
 | --------- | ------------- | ----- | --------- | ----------------- |
 | 0         | woffset       | int   | 0         |                   |
 | 1         | hoffset       | int   | 0         |                   |
 | 2         | coffset       | int   | 1         |                   |
 | 3         | outw          | int   | 1         |                   |
 | 13        | doffset       | int   | 0         |                   |
 | 2         | coffset       | int   | 0         |                   |
 | 3         | outw          | int   | 0         |                   |
 | 4         | outh          | int   | 0         |                   |
 | 14        | outd          | int   | 0         |                   |
 | 5         | outc          | int   | 0         |                   |
 | 6         | woffset2      | int   | 0         |                   |
 | 7         | hoffset2      | int   | 1         |                   |
 | 7         | hoffset2      | int   | 0         |                   |
 | 15        | doffset2      | int   | 0         |                   |
 | 8         | coffset2      | int   | 0         |                   |
 | 9         | starts        | array | [ ]       |                   |
 | 10        | ends          | array | [ ]       |                   |
@@ -819,6 +825,23 @@ else        y = x
 | --------- | ------------- | ----- | --------- | ----------------- |
 | 0         | alpha         | float | 0.1f      |                   |

 # Embed
 ```
 y = embedding(x)
 ```

 | param id  | name          | type  | default   | description       |
 | --------- | ------------- | ----- | --------- | ----------------- |
 | 0         | num_output    | int   | 0         |                   |
 | 1         | input_dim     | int   | 0         |                   |
 | 2         | bias_term     | int   | 0         |                   |
 | 3         | weight_data_size | int | 0        |                   |

 | weight        | type  | shape                 |
 | ------------- | ----- | --------------------- |
 | weight_data   | float | [weight_data_size]    |
 | bias_data     | float | [num_output]          |

 # Exp
 ```
 if base == -1   y = exp(shift + x * scale)
@@ -839,6 +862,29 @@ Reshape blob to 1 dimension

 * one_blob_only

 # Fold
 ```
 y = fold(x)
 ```

 * one_blob_only

 | param id  | name          | type  | default   | description       |
 | --------- | ------------- | ----- | --------- | ----------------- |
 | 0         | num_output    | int   | 0         |                   |
 | 1         | kernel_w      | int   | 0         |                   |
 | 2         | dilation_w    | int   | 1         |                   |
 | 3         | stride_w      | int   | 1         |                   |
 | 4         | pad_left      | int   | 0         |                   |
 | 11        | kernel_h      | int   | kernel_w  |                   |
 | 12        | dilation_h    | int   | dilation_w |                  |
 | 13        | stride_h      | int   | stride_w  |                   |
 | 14        | pad_top       | int   | pad_left  |                   |
 | 15        | pad_right     | int   | pad_left  |                   |
 | 16        | pad_bottom    | int   | pad_top   |                   |
 | 20        | output_w      | int   | 0         |                   |
 | 21        | output_h      | int   | output_w  |                   |

 # GELU
 ```
 if fast_gelu == 1   y = 0.5 * x * (1 + tanh(0.79788452 * (x + 0.044715 * x * x * x)));
@@ -1187,6 +1233,7 @@ y = data
 | 1         | h             | int   | 0         |                   |
 | 11        | d             | int   | 0         |                   |
 | 2         | c             | int   | 0         |                   |
 | 21        | load_type     | int   | 1         | 1=fp32            |

 | weight        | type  | shape                 |
 | ------------- | ----- | --------------------- |
@@ -1537,6 +1584,7 @@ y = reduce_op(x * coeff)
 | 2         | coeff         | float | 1.f       |                   |
 | 3         | axes          | array | [ ]       |                   |
 | 4         | keepdims      | int   | 0         |                   |
 | 5         | fixbug0       | int   | 0         | hack for bug fix, should be 1 |

 Operation type:
 - 0 = SUM
@@ -1829,3 +1877,24 @@ Operation type:
 - 17 = LOG10
 - 18 = ROUND
 - 19 = TRUNC

 # Unfold
 ```
 y = unfold(x)
 ```

 * one_blob_only

 | param id  | name          | type  | default   | description       |
 | --------- | ------------- | ----- | --------- | ----------------- |
 | 0         | num_output    | int   | 0         |                   |
 | 1         | kernel_w      | int   | 0         |                   |
 | 2         | dilation_w    | int   | 1         |                   |
 | 3         | stride_w      | int   | 1         |                   |
 | 4         | pad_left      | int   | 0         |                   |
 | 11        | kernel_h      | int   | kernel_w  |                   |
 | 12        | dilation_h    | int   | dilation_w |                  |
 | 13        | stride_h      | int   | stride_w  |                   |
 | 14        | pad_top       | int   | pad_left  |                   |
 | 15        | pad_right     | int   | pad_left  |                   |
 | 16        | pad_bottom    | int   | pad_top   |                   |
--- a/src/layer/arm/convolution1d_arm.cpp
+++ b/src/layer/arm/convolution1d_arm.cpp
@@ -68,7 +68,8 @@ int Convolution1D_arm::create_pipeline(const Option& opt)

    convolution1d_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -233,13 +234,14 @@ int Convolution1D_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector
 }

 #if NCNN_BF16
 int Convolution1D_arm::create_pipeline_bf16s(const Option& /*opt*/)
 int Convolution1D_arm::create_pipeline_bf16s(const Option& opt)
 {
    const int num_input = weight_data_size / kernel_w / num_output;

    convolution1d_transform_kernel_packed_bf16s(weight_data, weight_data_tm, num_input, num_output, kernel_w);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/arm/convolution1d_arm_asimdhp.cpp
+++ b/src/layer/arm/convolution1d_arm_asimdhp.cpp
@@ -36,7 +36,8 @@ int Convolution1D_arm::create_pipeline_fp16s(const Option& opt)

    ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -194,7 +194,8 @@ int Convolution_arm::create_pipeline(const Option& opt)

        convolution_dilation1->create_pipeline(opt);

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -224,7 +225,8 @@ int Convolution_arm::create_pipeline(const Option& opt)
        else
            conv3x3s1_winograd23_transform_kernel(weight_data, weight_winograd23_data, num_input, num_output, opt);

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -270,7 +272,8 @@ int Convolution_arm::create_pipeline(const Option& opt)
    {
        convolution_im2col_gemm_transform_kernel(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -305,7 +308,8 @@ int Convolution_arm::create_pipeline(const Option& opt)
        convolution_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -904,7 +908,8 @@ int Convolution_arm::create_pipeline_bf16s(const Option& opt)
        else
            conv3x3s1_winograd23_transform_kernel(weight_data, weight_winograd23_data, num_input, num_output, opt);

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -950,7 +955,8 @@ int Convolution_arm::create_pipeline_bf16s(const Option& opt)
    {
        convolution_im2col_gemm_transform_kernel_bf16s(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -971,7 +977,8 @@ int Convolution_arm::create_pipeline_bf16s(const Option& opt)
        convolution_transform_kernel_packed_bf16s(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -1284,7 +1291,8 @@ int Convolution_arm::create_pipeline_int8_arm(const Option& opt)
        scale_in_data[p] = scale_in;
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/arm/convolution_arm_asimdhp.cpp
+++ b/src/layer/arm/convolution_arm_asimdhp.cpp
@@ -108,7 +108,8 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
        else
            conv3x3s1_winograd23_transform_kernel_fp16sa(weight_data, weight_winograd23_data, num_input, num_output, opt);

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        if (opt.use_fp16_arithmetic)
        {
@@ -189,7 +190,8 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)

        ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -219,7 +221,8 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
        ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/arm/convolutiondepthwise_arm.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm.cpp
@@ -119,7 +119,8 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
                ncnn::cast_float32_to_bfloat16(weight_data, weight_data_tm, opt);
            }

            weight_data.release();
            if (opt.lightmode)
                weight_data.release();

            return 0;
        }
@@ -161,7 +162,8 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
            }
        }

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -169,7 +171,8 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
    // group convolution
    create_group_ops(opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -1022,7 +1025,8 @@ int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt)
            weight_data_tm = weight_data;
        }

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -1030,7 +1034,8 @@ int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt)
    // group convolution
    create_group_ops(opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
@@ -76,7 +76,8 @@ int ConvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt)

        ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -84,7 +85,8 @@ int ConvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt)
    // group convolution
    create_group_ops(opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/arm/deconvolution_arm.cpp
+++ b/src/layer/arm/deconvolution_arm.cpp
@@ -211,7 +211,8 @@ int Deconvolution_arm::create_pipeline(const Option& opt)
        }
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -954,7 +955,8 @@ int Deconvolution_arm::create_pipeline_bf16s(const Option& opt)
        }
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/arm/deconvolution_arm_asimdhp.cpp
+++ b/src/layer/arm/deconvolution_arm_asimdhp.cpp
@@ -154,7 +154,8 @@ int Deconvolution_arm::create_pipeline_fp16s(const Option& opt)

    ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/arm/deconvolutiondepthwise_arm.cpp
+++ b/src/layer/arm/deconvolutiondepthwise_arm.cpp
@@ -104,7 +104,8 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
                ncnn::cast_float32_to_bfloat16(weight_data_transposed, weight_data_tm, opt);
            }

            weight_data.release();
            if (opt.lightmode)
                weight_data.release();

            return 0;
        }
@@ -190,7 +191,8 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
        }
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp
+++ b/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp
@@ -145,7 +145,8 @@ int DeconvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt)
        }
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/arm/gemm_arm.cpp
+++ b/src/layer/arm/gemm_arm.cpp
@@ -4201,7 +4201,8 @@ int Gemm_arm::create_pipeline(const Option& opt)
            }
        }

        A_data.release();
        if (opt.lightmode)
            A_data.release();
    }

    if (constantB)
@@ -4241,7 +4242,8 @@ int Gemm_arm::create_pipeline(const Option& opt)
            }
        }

        B_data.release();
        if (opt.lightmode)
            B_data.release();
    }

    if (constantC && constant_broadcast_type_C != -1)
@@ -4271,7 +4273,8 @@ int Gemm_arm::create_pipeline(const Option& opt)
            CT_data = C2;
        }

        C_data.release();
        if (opt.lightmode)
            C_data.release();
    }

    if (constantA || constantB || constantC)
@@ -4889,7 +4892,8 @@ int Gemm_arm::create_pipeline_bf16s(const Option& opt)
            }
        }

        A_data.release();
        if (opt.lightmode)
            A_data.release();
    }

    if (constantB)
@@ -4929,7 +4933,8 @@ int Gemm_arm::create_pipeline_bf16s(const Option& opt)
            }
        }

        B_data.release();
        if (opt.lightmode)
            B_data.release();
    }

    if (constantC && constant_broadcast_type_C != -1)
@@ -4959,7 +4964,8 @@ int Gemm_arm::create_pipeline_bf16s(const Option& opt)
            CT_data = C2;
        }

        C_data.release();
        if (opt.lightmode)
            C_data.release();
    }

    if (constantA || constantB || constantC)
--- a/src/layer/arm/gemm_arm_asimdhp.cpp
+++ b/src/layer/arm/gemm_arm_asimdhp.cpp
@@ -2736,7 +2736,8 @@ int Gemm_arm::create_pipeline_fp16sa(const Option& opt)
            }
        }

        A_data.release();
        if (opt.lightmode)
            A_data.release();
    }

    if (constantB)
@@ -2776,7 +2777,8 @@ int Gemm_arm::create_pipeline_fp16sa(const Option& opt)
            }
        }

        B_data.release();
        if (opt.lightmode)
            B_data.release();
    }

    if (constantC && constant_broadcast_type_C != -1)
@@ -2802,7 +2804,8 @@ int Gemm_arm::create_pipeline_fp16sa(const Option& opt)
            }
        }

        C_data.release();
        if (opt.lightmode)
            C_data.release();
    }

    if (constantA || constantB || constantC)
--- a/src/layer/arm/gemm_arm_vfpv4.cpp
+++ b/src/layer/arm/gemm_arm_vfpv4.cpp
@@ -427,7 +427,8 @@ int Gemm_arm::create_pipeline_fp16s(const Option& opt)
            }
        }

        A_data.release();
        if (opt.lightmode)
            A_data.release();
    }

    if (constantB)
@@ -467,7 +468,8 @@ int Gemm_arm::create_pipeline_fp16s(const Option& opt)
            }
        }

        B_data.release();
        if (opt.lightmode)
            B_data.release();
    }

    if (constantC && constant_broadcast_type_C != -1)
@@ -497,7 +499,8 @@ int Gemm_arm::create_pipeline_fp16s(const Option& opt)
            CT_data = C2;
        }

        C_data.release();
        if (opt.lightmode)
            C_data.release();
    }

    if (constantA || constantB || constantC)
--- a/src/layer/arm/gru_arm.cpp
+++ b/src/layer/arm/gru_arm.cpp
@@ -250,9 +250,12 @@ int GRU_arm::create_pipeline(const Option& opt)
        }
    }

    weight_xc_data.release();
    bias_c_data.release();
    weight_hc_data.release();
    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
 }
@@ -1372,9 +1375,12 @@ int GRU_arm::create_pipeline_bf16s(const Option& opt)
        }
    }

    weight_xc_data.release();
    bias_c_data.release();
    weight_hc_data.release();
    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
 }
--- a/src/layer/arm/gru_arm_asimdhp.cpp
+++ b/src/layer/arm/gru_arm_asimdhp.cpp
@@ -914,9 +914,12 @@ int GRU_arm::create_pipeline_fp16s(const Option& opt)
        }
    }

    weight_xc_data.release();
    bias_c_data.release();
    weight_hc_data.release();
    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
 }
--- a/src/layer/arm/innerproduct_arm.cpp
+++ b/src/layer/arm/innerproduct_arm.cpp
@@ -122,7 +122,8 @@ int InnerProduct_arm::create_pipeline(const Option& opt)
        weight_data_tm = weight_data;
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -865,7 +866,8 @@ int InnerProduct_arm::create_pipeline_bf16s(const Option& opt)
        }
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -1258,7 +1260,8 @@ int InnerProduct_arm::create_pipeline_int8_arm(const Option& opt)
        scale_in_data[p] = scale_in;
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/arm/innerproduct_arm_vfpv4.cpp
+++ b/src/layer/arm/innerproduct_arm_vfpv4.cpp
@@ -41,7 +41,8 @@ int InnerProduct_arm::create_pipeline_fp16s(const Option& opt)
    }
 #endif

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/arm/lstm_arm.cpp
+++ b/src/layer/arm/lstm_arm.cpp
@@ -124,9 +124,12 @@ int LSTM_arm::create_pipeline(const Option& opt)
        }
    }

    weight_xc_data.release();
    bias_c_data.release();
    weight_hc_data.release();
    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
 }
@@ -928,9 +931,12 @@ int LSTM_arm::create_pipeline_bf16s(const Option& opt)
        }
    }

    weight_xc_data.release();
    bias_c_data.release();
    weight_hc_data.release();
    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
 }
--- a/src/layer/arm/lstm_arm_asimdhp.cpp
+++ b/src/layer/arm/lstm_arm_asimdhp.cpp
@@ -835,9 +835,12 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt)
        }
    }

    weight_xc_data.release();
    bias_c_data.release();
    weight_hc_data.release();
    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
 }
--- a/src/layer/arm/multiheadattention_arm.cpp
+++ b/src/layer/arm/multiheadattention_arm.cpp
@@ -84,8 +84,11 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
        q_gemm->load_model(ModelBinFromMatArray(weights));
        q_gemm->create_pipeline(opt);

        q_weight_data.release();
        q_bias_data.release();
        if (opt.lightmode)
        {
            q_weight_data.release();
            q_bias_data.release();
        }
    }

    {
@@ -110,8 +113,11 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
        k_gemm->load_model(ModelBinFromMatArray(weights));
        k_gemm->create_pipeline(opt);

        k_weight_data.release();
        k_bias_data.release();
        if (opt.lightmode)
        {
            k_weight_data.release();
            k_bias_data.release();
        }
    }

    {
@@ -136,8 +142,11 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
        v_gemm->load_model(ModelBinFromMatArray(weights));
        v_gemm->create_pipeline(opt);

        v_weight_data.release();
        v_bias_data.release();
        if (opt.lightmode)
        {
            v_weight_data.release();
            v_bias_data.release();
        }
    }

    {
@@ -160,8 +169,11 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
        o_gemm->load_model(ModelBinFromMatArray(weights));
        o_gemm->create_pipeline(opt);

        out_weight_data.release();
        out_bias_data.release();
        if (opt.lightmode)
        {
            out_weight_data.release();
            out_bias_data.release();
        }
    }

    {
--- a/src/layer/arm/rnn_arm.cpp
+++ b/src/layer/arm/rnn_arm.cpp
@@ -139,9 +139,12 @@ int RNN_arm::create_pipeline(const Option& opt)

    bias_c_data_packed = bias_c_data;

    weight_xc_data.release();
    bias_c_data.release();
    weight_hc_data.release();
    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
 }
@@ -736,9 +739,12 @@ int RNN_arm::create_pipeline_bf16s(const Option& opt)

    cast_float32_to_bfloat16(bias_c_data, bias_c_data_packed, opt);

    weight_xc_data.release();
    bias_c_data.release();
    weight_hc_data.release();
    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
 }
--- a/src/layer/arm/rnn_arm_asimdhp.cpp
+++ b/src/layer/arm/rnn_arm_asimdhp.cpp
@@ -517,9 +517,12 @@ int RNN_arm::create_pipeline_fp16s(const Option& opt)

    cast_float32_to_float16(bias_c_data, bias_c_data_packed, opt);

    weight_xc_data.release();
    bias_c_data.release();
    weight_hc_data.release();
    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
 }
--- a/src/layer/loongarch/convolution1d_loongarch.cpp
+++ b/src/layer/loongarch/convolution1d_loongarch.cpp
@@ -78,6 +78,9 @@ int Convolution1D_loongarch::create_pipeline(const Option& opt)
        }
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
 }

@@ -281,7 +284,7 @@ int Convolution1D_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, cons
                        sum = bias_data[p];
                    }

                    const float* kptr = (const float*)weight_data + kernel_w * h * p;
                    const float* kptr = weight_data_packed.channel(p);

                    for (int q = 0; q < h; q++)
                    {
--- a/src/layer/loongarch/convolution_loongarch.cpp
+++ b/src/layer/loongarch/convolution_loongarch.cpp
@@ -225,7 +225,8 @@ int Convolution_loongarch::create_pipeline(const Option& opt)
        }
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -789,7 +790,8 @@ int Convolution_loongarch::create_pipeline_int8_loongarch(const Option& opt)
        scale_in_data[p] = scale_in;
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/loongarch/convolutiondepthwise_loongarch.cpp
+++ b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp
@@ -83,7 +83,8 @@ int ConvolutionDepthWise_loongarch::create_pipeline(const Option& opt)
            weight_data_tm = weight_data;
        }

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -91,7 +92,8 @@ int ConvolutionDepthWise_loongarch::create_pipeline(const Option& opt)
    // group convolution
    create_group_ops(opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -600,7 +602,8 @@ int ConvolutionDepthWise_loongarch::create_pipeline_int8_loongarch(const Option&
            weight_data_tm = weight_data;
        }

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -608,7 +611,8 @@ int ConvolutionDepthWise_loongarch::create_pipeline_int8_loongarch(const Option&
    // group convolution
    create_group_ops(opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/loongarch/deconvolution_loongarch.cpp
+++ b/src/layer/loongarch/deconvolution_loongarch.cpp
@@ -126,7 +126,8 @@ int Deconvolution_loongarch::create_pipeline(const Option& opt)
    {
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp
+++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp
@@ -82,7 +82,8 @@ int DeconvolutionDepthWise_loongarch::create_pipeline(const Option& opt)
            weight_data_tm = weight_data_transposed;
        }

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -90,7 +91,8 @@ int DeconvolutionDepthWise_loongarch::create_pipeline(const Option& opt)
    // group convolution
    create_group_ops(opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/loongarch/innerproduct_loongarch.cpp
+++ b/src/layer/loongarch/innerproduct_loongarch.cpp
@@ -99,7 +99,8 @@ int InnerProduct_loongarch::create_pipeline(const Option& opt)
        weight_data_tm = weight_data;
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -652,7 +653,8 @@ int InnerProduct_loongarch::create_pipeline_fp16s(const Option& opt)
        ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt);
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -1140,7 +1142,8 @@ int InnerProduct_loongarch::create_pipeline_int8_loongarch(const Option& opt)
        scale_in_data[p] = scale_in;
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/mips/convolution1d_mips.cpp
+++ b/src/layer/mips/convolution1d_mips.cpp
@@ -78,6 +78,9 @@ int Convolution1D_mips::create_pipeline(const Option& opt)
        }
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
 }

@@ -281,7 +284,7 @@ int Convolution1D_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Opt
                        sum = bias_data[p];
                    }

                    const float* kptr = (const float*)weight_data + kernel_w * h * p;
                    const float* kptr = weight_data_packed.channel(p);

                    for (int q = 0; q < h; q++)
                    {
--- a/src/layer/mips/convolution_mips.cpp
+++ b/src/layer/mips/convolution_mips.cpp
@@ -225,7 +225,8 @@ int Convolution_mips::create_pipeline(const Option& opt)
        }
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -789,7 +790,8 @@ int Convolution_mips::create_pipeline_int8_mips(const Option& opt)
        scale_in_data[p] = scale_in;
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/mips/convolutiondepthwise_mips.cpp
+++ b/src/layer/mips/convolutiondepthwise_mips.cpp
@@ -83,7 +83,8 @@ int ConvolutionDepthWise_mips::create_pipeline(const Option& opt)
            weight_data_tm = weight_data;
        }

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -91,7 +92,8 @@ int ConvolutionDepthWise_mips::create_pipeline(const Option& opt)
    // group convolution
    create_group_ops(opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -600,7 +602,8 @@ int ConvolutionDepthWise_mips::create_pipeline_int8_mips(const Option& opt)
            weight_data_tm = weight_data;
        }

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -608,7 +611,8 @@ int ConvolutionDepthWise_mips::create_pipeline_int8_mips(const Option& opt)
    // group convolution
    create_group_ops(opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/mips/deconvolution_mips.cpp
+++ b/src/layer/mips/deconvolution_mips.cpp
@@ -126,7 +126,8 @@ int Deconvolution_mips::create_pipeline(const Option& opt)
    {
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/mips/deconvolutiondepthwise_mips.cpp
+++ b/src/layer/mips/deconvolutiondepthwise_mips.cpp
@@ -82,7 +82,8 @@ int DeconvolutionDepthWise_mips::create_pipeline(const Option& opt)
            weight_data_tm = weight_data_transposed;
        }

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -90,7 +91,8 @@ int DeconvolutionDepthWise_mips::create_pipeline(const Option& opt)
    // group convolution
    create_group_ops(opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/mips/innerproduct_mips.cpp
+++ b/src/layer/mips/innerproduct_mips.cpp
@@ -99,7 +99,8 @@ int InnerProduct_mips::create_pipeline(const Option& opt)
        weight_data_tm = weight_data;
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -652,7 +653,8 @@ int InnerProduct_mips::create_pipeline_fp16s(const Option& opt)
        ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt);
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -1140,7 +1142,8 @@ int InnerProduct_mips::create_pipeline_int8_mips(const Option& opt)
        scale_in_data[p] = scale_in;
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/riscv/convolution1d_riscv.cpp
+++ b/src/layer/riscv/convolution1d_riscv.cpp
@@ -95,6 +95,9 @@ int Convolution1D_riscv::create_pipeline(const Option& opt)
        }
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
 }

@@ -308,7 +311,7 @@ int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op
                        sum = bias_data[p];
                    }

                    const float* kptr = (const float*)weight_data + kernel_w * h * p;
                    const float* kptr = weight_data_packed.channel(p);

                    for (int q = 0; q < h; q++)
                    {
@@ -470,7 +473,8 @@ int Convolution1D_riscv::create_pipeline_fp16s(const Option& opt)

    ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/riscv/convolution_riscv.cpp
+++ b/src/layer/riscv/convolution_riscv.cpp
@@ -237,7 +237,8 @@ int Convolution_riscv::create_pipeline(const Option& opt)
        }
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -834,7 +835,8 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt)
        ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/riscv/convolutiondepthwise_riscv.cpp
+++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp
@@ -104,7 +104,8 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt)
            weight_data_tm = weight_data;
        }

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -112,7 +113,8 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt)
    // group convolution
    create_group_ops(opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -682,7 +684,8 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)

        ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -690,7 +693,8 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
    // group convolution
    create_group_ops(opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/riscv/deconvolution_riscv.cpp
+++ b/src/layer/riscv/deconvolution_riscv.cpp
@@ -148,7 +148,8 @@ int Deconvolution_riscv::create_pipeline(const Option& opt)
    {
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -530,7 +531,8 @@ int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt)

    ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp
+++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp
@@ -97,7 +97,8 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt)
            weight_data_tm = weight_data_transposed;
        }

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -105,7 +106,8 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt)
    // group convolution
    create_group_ops(opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -619,7 +621,8 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)

        ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -627,7 +630,8 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
    // group convolution
    create_group_ops(opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/riscv/gemm_riscv.cpp
+++ b/src/layer/riscv/gemm_riscv.cpp
@@ -3984,7 +3984,8 @@ int Gemm_riscv::create_pipeline(const Option& opt)
            }
        }

        A_data.release();
        if (opt.lightmode)
            A_data.release();
    }

    if (constantB)
@@ -4024,7 +4025,8 @@ int Gemm_riscv::create_pipeline(const Option& opt)
            }
        }

        B_data.release();
        if (opt.lightmode)
            B_data.release();
    }

    if (constantC && constant_broadcast_type_C != -1)
@@ -4054,7 +4056,8 @@ int Gemm_riscv::create_pipeline(const Option& opt)
            CT_data = C2;
        }

        C_data.release();
        if (opt.lightmode)
            C_data.release();
    }

    if (constantA || constantB || constantC)
--- a/src/layer/riscv/gru_riscv.cpp
+++ b/src/layer/riscv/gru_riscv.cpp
@@ -714,9 +714,12 @@ int GRU_riscv::create_pipeline_fp16sa(const Option& opt)
    cast_float32_to_float16(weight_hc_data, weight_hc_data_fp16sa, opt);
    cast_float32_to_float16(bias_c_data, bias_c_data_fp16sa, opt);

    weight_xc_data.release();
    bias_c_data.release();
    weight_hc_data.release();
    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
 }
--- a/src/layer/riscv/innerproduct_riscv.cpp
+++ b/src/layer/riscv/innerproduct_riscv.cpp
@@ -106,7 +106,8 @@ int InnerProduct_riscv::create_pipeline(const Option& opt)
        weight_data_tm = weight_data;
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -560,7 +561,8 @@ int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt)

    ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/vulkan/batchnorm_vulkan.cpp
+++ b/src/layer/vulkan/batchnorm_vulkan.cpp
@@ -156,6 +156,12 @@ int BatchNorm_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
        cmd.record_upload(b_data_packed, b_data_gpu, opt);
    }

    if (opt.lightmode)
    {
        a_data.release();
        b_data.release();
    }

    return 0;
 }

--- a/src/layer/vulkan/convolution1d_vulkan.cpp
+++ b/src/layer/vulkan/convolution1d_vulkan.cpp
@@ -133,8 +133,11 @@ int Convolution1D_vulkan::create_pipeline(const Option& _opt)
        pipeline_convolution1d->create(shader_type_index, opt, specializations);
    }

    weight_data.release();
    bias_data.release();
    if (opt.lightmode)
    {
        weight_data.release();
        bias_data.release();
    }

    return 0;
 }
--- a/src/layer/vulkan/convolution_vulkan.cpp
+++ b/src/layer/vulkan/convolution_vulkan.cpp
@@ -1148,8 +1148,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
        pipeline_convolution->create(shader_type_index, opt, specializations);
    }

    weight_data.release();
    bias_data.release();
    if (opt.lightmode)
    {
        weight_data.release();
        bias_data.release();
    }

    return 0;
 }
--- a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
+++ b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
@@ -271,8 +271,11 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
            pipeline_convolutiondepthwise_pack8->create(LayerShaderType::convolutiondepthwise_pack8, opt, specializations);
        }

        weight_data.release();
        bias_data.release();
        if (opt.lightmode)
        {
            weight_data.release();
            bias_data.release();
        }

        return 0;
    }
@@ -413,8 +416,11 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
        pipeline_convolutiondepthwise_group_pack8to1->create(LayerShaderType::convolutiondepthwise_group_pack8to1, opt, specializations);
    }

    weight_data.release();
    bias_data.release();
    if (opt.lightmode)
    {
        weight_data.release();
        bias_data.release();
    }

    return 0;
 }
--- a/src/layer/vulkan/deconvolution_vulkan.cpp
+++ b/src/layer/vulkan/deconvolution_vulkan.cpp
@@ -366,6 +366,12 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt)
            pipeline_deconvolution_col2im->create(shader_type_index, opt, specializations);
        }

        if (opt.lightmode)
        {
            weight_data.release();
            bias_data.release();
        }

        return 0;
    }

@@ -462,8 +468,11 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt)
    pipeline_deconvolution->set_optimal_local_size_xyz(local_size_xyz);
    pipeline_deconvolution->create(shader_type_index, opt, specializations);

    weight_data.release();
    bias_data.release();
    if (opt.lightmode)
    {
        weight_data.release();
        bias_data.release();
    }

    return 0;
 }
--- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
+++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
@@ -295,8 +295,11 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
            pipeline_deconvolutiondepthwise_pack8->create(LayerShaderType::deconvolutiondepthwise_pack8, opt, specializations);
        }

        weight_data.release();
        bias_data.release();
        if (opt.lightmode)
        {
            weight_data.release();
            bias_data.release();
        }

        return 0;
    }
@@ -437,8 +440,11 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
        pipeline_deconvolutiondepthwise_group_pack8to1->create(LayerShaderType::deconvolutiondepthwise_group_pack8to1, opt, specializations);
    }

    weight_data.release();
    bias_data.release();
    if (opt.lightmode)
    {
        weight_data.release();
        bias_data.release();
    }

    return 0;
 }
--- a/src/layer/vulkan/gemm_vulkan.cpp
+++ b/src/layer/vulkan/gemm_vulkan.cpp
@@ -100,9 +100,12 @@ int Gemm_vulkan::create_pipeline(const Option& opt)
        pipeline_gemm->create(LayerShaderType::gemm, opt, specializations);
    }

    A_data.release();
    B_data.release();
    C_data.release();
    if (opt.lightmode)
    {
        A_data.release();
        B_data.release();
        C_data.release();
    }

    return 0;
 }
--- a/src/layer/vulkan/innerproduct_vulkan.cpp
+++ b/src/layer/vulkan/innerproduct_vulkan.cpp
@@ -154,8 +154,11 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt)
        pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_innerproduct_gemm->create(shader_type_index, opt, specializations);

        weight_data.release();
        bias_data.release();
        if (opt.lightmode)
        {
            weight_data.release();
            bias_data.release();
        }

        return 0;
    }
@@ -364,14 +367,20 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt)
        pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz);
        pipeline_innerproduct_gemm->create(shader_type_index, opt, specializations);

        weight_data.release();
        bias_data.release();
        if (opt.lightmode)
        {
            weight_data.release();
            bias_data.release();
        }

        return 0;
    }

    weight_data.release();
    bias_data.release();
    if (opt.lightmode)
    {
        weight_data.release();
        bias_data.release();
    }

    return 0;
 }
--- a/src/layer/vulkan/memorydata_vulkan.cpp
+++ b/src/layer/vulkan/memorydata_vulkan.cpp
@@ -82,6 +82,11 @@ int MemoryData_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
        cmd.record_upload(data_packed, data_gpu, opt, /*bool flatten*/ false);
    }

    if (opt.lightmode)
    {
        data.release();
    }

    return 0;
 }

--- a/src/layer/vulkan/multiheadattention_vulkan.cpp
+++ b/src/layer/vulkan/multiheadattention_vulkan.cpp
@@ -73,8 +73,11 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
        q_gemm->load_model(ModelBinFromMatArray(weights));
        q_gemm->create_pipeline(opt);

        q_weight_data.release();
        q_bias_data.release();
        if (opt.lightmode)
        {
            q_weight_data.release();
            q_bias_data.release();
        }
    }

    {
@@ -100,8 +103,11 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
        k_gemm->load_model(ModelBinFromMatArray(weights));
        k_gemm->create_pipeline(opt);

        k_weight_data.release();
        k_bias_data.release();
        if (opt.lightmode)
        {
            k_weight_data.release();
            k_bias_data.release();
        }
    }

    {
@@ -127,8 +133,11 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
        v_gemm->load_model(ModelBinFromMatArray(weights));
        v_gemm->create_pipeline(opt);

        v_weight_data.release();
        v_bias_data.release();
        if (opt.lightmode)
        {
            v_weight_data.release();
            v_bias_data.release();
        }
    }

    {
@@ -222,8 +231,11 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
        o_gemm->load_model(ModelBinFromMatArray(weights));
        o_gemm->create_pipeline(opt);

        out_weight_data.release();
        out_bias_data.release();
        if (opt.lightmode)
        {
            out_weight_data.release();
            out_bias_data.release();
        }
    }

    return 0;
--- a/src/layer/vulkan/normalize_vulkan.cpp
+++ b/src/layer/vulkan/normalize_vulkan.cpp
@@ -264,6 +264,9 @@ int Normalize_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
        {
            cmd.record_upload(scale_data_packed, scale_data_gpu, opt);
        }

        if (opt.lightmode)
            scale_data.release();
    }

    return 0;
--- a/src/layer/vulkan/padding_vulkan.cpp
+++ b/src/layer/vulkan/padding_vulkan.cpp
@@ -348,6 +348,11 @@ int Padding_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
        cmd.record_upload(per_channel_pad_data_packed, per_channel_pad_data_gpu, opt);
    }

    if (opt.lightmode)
    {
        per_channel_pad_data.release();
    }

    return 0;
 }

--- a/src/layer/vulkan/prelu_vulkan.cpp
+++ b/src/layer/vulkan/prelu_vulkan.cpp
@@ -144,6 +144,11 @@ int PReLU_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
        {
            cmd.record_upload(slope_data_packed, slope_data_gpu, opt);
        }

        if (opt.lightmode)
        {
            slope_data.release();
        }
    }

    return 0;
--- a/src/layer/vulkan/priorbox_vulkan.cpp
+++ b/src/layer/vulkan/priorbox_vulkan.cpp
@@ -129,6 +129,13 @@ int PriorBox_vulkan::upload_model(VkTransfer& cmd, const Option& opt)

    cmd.record_upload(aspect_ratios, aspect_ratios_gpu, opt);

    if (opt.lightmode)
    {
        min_sizes.release();
        max_sizes.release();
        aspect_ratios.release();
    }

    return 0;
 }

@@ -137,7 +144,7 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
    int w = bottom_blobs[0].w;
    int h = bottom_blobs[0].h;

    if (bottom_blobs.size() == 1 && image_width == -233 && image_height == -233 && max_sizes.empty())
    if (bottom_blobs.size() == 1 && image_width == -233 && image_height == -233 && max_sizes_gpu.empty())
    {
        // mxnet style _contrib_MultiBoxPrior
        float step_w = step_width;
@@ -147,8 +154,8 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
        if (step_h == -233)
            step_h = 1.f / (float)h;

        int num_sizes = min_sizes.w;
        int num_ratios = aspect_ratios.w;
        int num_sizes = min_sizes_gpu.w;
        int num_ratios = aspect_ratios_gpu.w;

        int num_prior = num_sizes - 1 + num_ratios;

@@ -200,9 +207,9 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
    if (step_h == -233)
        step_h = (float)image_h / h;

    int num_min_size = min_sizes.w;
    int num_max_size = max_sizes.w;
    int num_aspect_ratio = aspect_ratios.w;
    int num_min_size = min_sizes_gpu.w;
    int num_max_size = max_sizes_gpu.w;
    int num_aspect_ratio = aspect_ratios_gpu.w;

    int num_prior = num_min_size * num_aspect_ratio + num_min_size + num_max_size;
    if (flip)
--- a/src/layer/vulkan/scale_vulkan.cpp
+++ b/src/layer/vulkan/scale_vulkan.cpp
@@ -218,6 +218,12 @@ int Scale_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
        }
    }

    if (opt.lightmode)
    {
        scale_data.release();
        bias_data.release();
    }

    return 0;
 }

--- a/src/layer/x86/convolution1d_x86.cpp
+++ b/src/layer/x86/convolution1d_x86.cpp
@@ -34,7 +34,7 @@ Convolution1D_x86::Convolution1D_x86()
 #endif // __SSE2__
 }

 int Convolution1D_x86::create_pipeline(const Option& /*opt*/)
 int Convolution1D_x86::create_pipeline(const Option& opt)
 {
    if (dynamic_weight)
        return 0;
@@ -43,7 +43,8 @@ int Convolution1D_x86::create_pipeline(const Option& /*opt*/)

    convolution1d_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -334,7 +334,8 @@ int Convolution_x86::create_pipeline(const Option& opt)

        convolution_dilation1->create_pipeline(opt);

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -451,7 +452,8 @@ int Convolution_x86::create_pipeline(const Option& opt)
            }
        }

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -542,7 +544,8 @@ int Convolution_x86::create_pipeline(const Option& opt)
        }
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -1250,7 +1253,8 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
        scale_in_data[p] = scale_in;
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/x86/convolutiondepthwise_x86.cpp
+++ b/src/layer/x86/convolutiondepthwise_x86.cpp
@@ -132,7 +132,8 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt)
            }
        }

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -140,7 +141,8 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt)
    // group convolution
    create_group_ops(opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -843,7 +845,8 @@ int ConvolutionDepthWise_x86::create_pipeline_int8_x86(const Option& opt)
            weight_data_tm = weight_data;
        }

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -851,7 +854,8 @@ int ConvolutionDepthWise_x86::create_pipeline_int8_x86(const Option& opt)
    // group convolution
    create_group_ops(opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/x86/deconvolution_x86.cpp
+++ b/src/layer/x86/deconvolution_x86.cpp
@@ -193,7 +193,8 @@ int Deconvolution_x86::create_pipeline(const Option& opt)
        }
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/x86/deconvolutiondepthwise_x86.cpp
+++ b/src/layer/x86/deconvolutiondepthwise_x86.cpp
@@ -109,7 +109,8 @@ int DeconvolutionDepthWise_x86::create_pipeline(const Option& opt)
            weight_data_tm = weight_data_transposed;
        }

        weight_data.release();
        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
@@ -117,7 +118,8 @@ int DeconvolutionDepthWise_x86::create_pipeline(const Option& opt)
    // group convolution
    create_group_ops(opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/x86/deformableconv2d_x86.cpp
+++ b/src/layer/x86/deformableconv2d_x86.cpp
@@ -203,7 +203,8 @@ int DeformableConv2D_x86::create_pipeline(const Option& opt)
        deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/x86/gemm_x86.cpp
+++ b/src/layer/x86/gemm_x86.cpp
@@ -7235,7 +7235,8 @@ int Gemm_x86::create_pipeline(const Option& opt)
            }
        }

        A_data.release();
        if (opt.lightmode)
            A_data.release();
    }

    if (constantB)
@@ -7279,7 +7280,8 @@ int Gemm_x86::create_pipeline(const Option& opt)
            }
        }

        B_data.release();
        if (opt.lightmode)
            B_data.release();
    }

    if (constantC && constant_broadcast_type_C != -1)
@@ -7315,7 +7317,8 @@ int Gemm_x86::create_pipeline(const Option& opt)
            CT_data = C2;
        }

        C_data.release();
        if (opt.lightmode)
            C_data.release();
    }

    if (constantA || constantB || constantC)
--- a/src/layer/x86/innerproduct_x86.cpp
+++ b/src/layer/x86/innerproduct_x86.cpp
@@ -80,7 +80,8 @@ int InnerProduct_x86::create_pipeline(const Option& opt)

    innerproduct_transform_kernel_sse(weight_data, weight_data_tm, num_input, num_output, opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -175,7 +176,8 @@ int InnerProduct_x86::create_pipeline_fp16s(const Option& opt)

    innerproduct_transform_kernel_fp16s_sse(weight_data, weight_data_tm, num_input, num_output, opt);

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
@@ -281,7 +283,8 @@ int InnerProduct_x86::create_pipeline_int8_x86(const Option& opt)
        scale_in_data[p] = scale_in;
    }

    weight_data.release();
    if (opt.lightmode)
        weight_data.release();

    return 0;
 }
--- a/src/layer/x86/lstm_x86.cpp
+++ b/src/layer/x86/lstm_x86.cpp
@@ -182,9 +182,12 @@ int LSTM_x86::create_pipeline(const Option& opt)
        }
    }

    weight_xc_data.release();
    bias_c_data.release();
    weight_hc_data.release();
    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
 }
--- a/src/layer/x86/multiheadattention_x86.cpp
+++ b/src/layer/x86/multiheadattention_x86.cpp
@@ -65,8 +65,11 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
        q_gemm->load_model(ModelBinFromMatArray(weights));
        q_gemm->create_pipeline(opt);

        q_weight_data.release();
        q_bias_data.release();
        if (opt.lightmode)
        {
            q_weight_data.release();
            q_bias_data.release();
        }
    }

    {
@@ -91,8 +94,11 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
        k_gemm->load_model(ModelBinFromMatArray(weights));
        k_gemm->create_pipeline(opt);

        k_weight_data.release();
        k_bias_data.release();
        if (opt.lightmode)
        {
            k_weight_data.release();
            k_bias_data.release();
        }
    }

    {
@@ -117,8 +123,11 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
        v_gemm->load_model(ModelBinFromMatArray(weights));
        v_gemm->create_pipeline(opt);

        v_weight_data.release();
        v_bias_data.release();
        if (opt.lightmode)
        {
            v_weight_data.release();
            v_bias_data.release();
        }
    }

    {
@@ -193,8 +202,11 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
        o_gemm->load_model(ModelBinFromMatArray(weights));
        o_gemm->create_pipeline(opt);

        out_weight_data.release();
        out_bias_data.release();
        if (opt.lightmode)
        {
            out_weight_data.release();
            out_bias_data.release();
        }
    }

    return 0;
--- a/tools/modelwriter.h
+++ b/tools/modelwriter.h
@@ -32,6 +32,7 @@
 #include "layer/batchnorm.h"
 #include "layer/bias.h"
 #include "layer/binaryop.h"
 #include "layer/celu.h"
 #include "layer/clip.h"
 #include "layer/concat.h"
 #include "layer/convolution.h"
@@ -51,6 +52,7 @@
 #include "layer/deconvolutiondepthwise3d.h"
 #include "layer/deformableconv2d.h"
 #include "layer/detectionoutput.h"
 #include "layer/diag.h"
 #include "layer/dropout.h"
 #include "layer/eltwise.h"
 #include "layer/elu.h"
@@ -835,6 +837,13 @@ int ModelWriter::save(const char* parampath, const char* binpath)
            fprintf_param_value(" 1=%d", with_scalar)
            fprintf_param_value(" 2=%e", b)
        }
        else if (layer->type == "CELU")
        {
            ncnn::CELU* op = (ncnn::CELU*)layer;
            ncnn::CELU* op_default = (ncnn::CELU*)layer_default;

            fprintf_param_value(" 0=%e", alpha)
        }
        else if (layer->type == "Clip")
        {
            ncnn::Clip* op = (ncnn::Clip*)layer;
@@ -888,18 +897,21 @@ int ModelWriter::save(const char* parampath, const char* binpath)
            }
            fprintf_param_value(" 19=%d", dynamic_weight)

            fwrite_weight_tag_data(op->weight_data, bp);
            fwrite_weight_data(op->bias_data, bp);
            if (op->dynamic_weight == 0)
            {
                fwrite_weight_tag_data(op->weight_data, bp);
                fwrite_weight_data(op->bias_data, bp);

 #if NCNN_INT8
            // write int8_scale data
            if (op->int8_scale_term)
            {
                fwrite_weight_data(op->weight_data_int8_scales, bp, 90, 100);
                fwrite_weight_data(op->bottom_blob_int8_scales, bp, 0.001, 1);
                fwrite_weight_data(op->top_blob_int8_scales, bp, 0.001, 1);
            }
                // write int8_scale data
                if (op->int8_scale_term)
                {
                    fwrite_weight_data(op->weight_data_int8_scales, bp, 90, 100);
                    fwrite_weight_data(op->bottom_blob_int8_scales, bp, 0.001, 1);
                    fwrite_weight_data(op->top_blob_int8_scales, bp, 0.001, 1);
                }
 #endif // NCNN_INT8
            }

            if (shape_ready)
            {
@@ -931,9 +943,13 @@ int ModelWriter::save(const char* parampath, const char* binpath)
            {
                if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp);
            }
            fprintf_param_value(" 19=%d", dynamic_weight)

            fwrite_weight_tag_data(op->weight_data, bp);
            fwrite_weight_data(op->bias_data, bp);
            if (op->dynamic_weight == 0)
            {
                fwrite_weight_tag_data(op->weight_data, bp);
                fwrite_weight_data(op->bias_data, bp);
            }

            if (shape_ready)
            {
@@ -1040,32 +1056,35 @@ int ModelWriter::save(const char* parampath, const char* binpath)
            }
            fprintf_param_value(" 19=%d", dynamic_weight)

            fwrite_weight_tag_data(op->weight_data, bp);
            fwrite_weight_data(op->bias_data, bp);
            if (op->dynamic_weight == 0)
            {
                fwrite_weight_tag_data(op->weight_data, bp);
                fwrite_weight_data(op->bias_data, bp);

 #if NCNN_INT8
            // write int8_scale data
            if (op->int8_scale_term == 1 || op->int8_scale_term == 101)
            {
                op->bottom_blob_int8_scales.w = 1;
            }
            if (op->int8_scale_term == 2 || op->int8_scale_term == 102)
            {
                op->weight_data_int8_scales.w = 1;
                op->bottom_blob_int8_scales.w = 1;
            }
            if (op->int8_scale_term > 100)
            {
                op->top_blob_int8_scales.w = 1;
            }
                // write int8_scale data
                if (op->int8_scale_term == 1 || op->int8_scale_term == 101)
                {
                    op->bottom_blob_int8_scales.w = 1;
                }
                if (op->int8_scale_term == 2 || op->int8_scale_term == 102)
                {
                    op->weight_data_int8_scales.w = 1;
                    op->bottom_blob_int8_scales.w = 1;
                }
                if (op->int8_scale_term > 100)
                {
                    op->top_blob_int8_scales.w = 1;
                }

            if (op->int8_scale_term)
            {
                fwrite_weight_data(op->weight_data_int8_scales, bp, 90, 100);
                fwrite_weight_data(op->bottom_blob_int8_scales, bp, 0.001, 1);
                fwrite_weight_data(op->top_blob_int8_scales, bp, 0.001, 1);
            }
                if (op->int8_scale_term)
                {
                    fwrite_weight_data(op->weight_data_int8_scales, bp, 90, 100);
                    fwrite_weight_data(op->bottom_blob_int8_scales, bp, 0.001, 1);
                    fwrite_weight_data(op->top_blob_int8_scales, bp, 0.001, 1);
                }
 #endif // NCNN_INT8
            }

            if (shape_ready)
            {
@@ -1098,9 +1117,13 @@ int ModelWriter::save(const char* parampath, const char* binpath)
            {
                if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp);
            }
            fprintf_param_value(" 19=%d", dynamic_weight)

            fwrite_weight_tag_data(op->weight_data, bp);
            fwrite_weight_data(op->bias_data, bp);
            if (op->dynamic_weight == 0)
            {
                fwrite_weight_tag_data(op->weight_data, bp);
                fwrite_weight_data(op->bias_data, bp);
            }

            if (shape_ready)
            {
@@ -1261,9 +1284,13 @@ int ModelWriter::save(const char* parampath, const char* binpath)
            {
                if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp);
            }
            fprintf_param_value(" 28=%d", dynamic_weight)

            fwrite_weight_tag_data(op->weight_data, bp);
            fwrite_weight_data(op->bias_data, bp);
            if (op->dynamic_weight == 0)
            {
                fwrite_weight_tag_data(op->weight_data, bp);
                fwrite_weight_data(op->bias_data, bp);
            }

            if (shape_ready)
            {
@@ -1296,9 +1323,13 @@ int ModelWriter::save(const char* parampath, const char* binpath)
            {
                if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp);
            }
            fprintf_param_value(" 28=%d", dynamic_weight)

            fwrite_weight_tag_data(op->weight_data, bp);
            fwrite_weight_data(op->bias_data, bp);
            if (op->dynamic_weight == 0)
            {
                fwrite_weight_tag_data(op->weight_data, bp);
                fwrite_weight_data(op->bias_data, bp);
            }

            if (shape_ready)
            {
@@ -1418,9 +1449,13 @@ int ModelWriter::save(const char* parampath, const char* binpath)
            {
                if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp);
            }
            fprintf_param_value(" 28=%d", dynamic_weight)

            fwrite_weight_tag_data(op->weight_data, bp);
            fwrite_weight_data(op->bias_data, bp);
            if (op->dynamic_weight == 0)
            {
                fwrite_weight_tag_data(op->weight_data, bp);
                fwrite_weight_data(op->bias_data, bp);
            }

            if (shape_ready)
            {
@@ -1454,9 +1489,13 @@ int ModelWriter::save(const char* parampath, const char* binpath)
            {
                if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp);
            }
            fprintf_param_value(" 28=%d", dynamic_weight)

            fwrite_weight_tag_data(op->weight_data, bp);
            fwrite_weight_data(op->bias_data, bp);
            if (op->dynamic_weight == 0)
            {
                fwrite_weight_tag_data(op->weight_data, bp);
                fwrite_weight_data(op->bias_data, bp);
            }

            if (shape_ready)
            {
@@ -1597,6 +1636,13 @@ int ModelWriter::save(const char* parampath, const char* binpath)
            fprintf_param_value(" 7=%e", variances[2])
            fprintf_param_value(" 8=%e", variances[3])
        }
        else if (layer->type == "Diag")
        {
            ncnn::Diag* op = (ncnn::Diag*)layer;
            ncnn::Diag* op_default = (ncnn::Diag*)layer_default;

            fprintf_param_value(" 0=%d", diagonal)
        }
        else if (layer->type == "Dropout")
        {
            ncnn::Dropout* op = (ncnn::Dropout*)layer;