nihui
/
ncnn

 
			
			   
				 
					
						
						
							
							# Tencent is pleased to support the open source community by making ncnn available.
#
# Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.

import torch
import torch.nn as nn
import torch.nn.functional as F
from packaging import version

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()

        self.attention_0_0 = nn.MultiheadAttention(embed_dim=64, num_heads=4)
        self.attention_0_1 = nn.MultiheadAttention(embed_dim=64, num_heads=8, bias=False, add_bias_kv=False, add_zero_attn=False)
        self.attention_0_2 = nn.MultiheadAttention(embed_dim=64, num_heads=16, bias=True, add_bias_kv=True, add_zero_attn=True)

        self.attention_0_3 = nn.MultiheadAttention(embed_dim=32, num_heads=8, bias=True)

        self.attention_0_4 = nn.MultiheadAttention(embed_dim=40, num_heads=4, kdim=30, vdim=20)
        self.attention_0_5 = nn.MultiheadAttention(embed_dim=40, num_heads=8, kdim=30, vdim=20, bias=False, add_bias_kv=False, add_zero_attn=False)
        self.attention_0_6 = nn.MultiheadAttention(embed_dim=40, num_heads=10, kdim=30, vdim=20, bias=True, add_bias_kv=True, add_zero_attn=True)

        if version.parse(torch.__version__) >= version.parse('1.9'):
            self.attention_1_0 = nn.MultiheadAttention(embed_dim=64, num_heads=4, batch_first=True)
            self.attention_1_1 = nn.MultiheadAttention(embed_dim=64, num_heads=8, bias=False, add_bias_kv=False, add_zero_attn=False, batch_first=True)
            self.attention_1_2 = nn.MultiheadAttention(embed_dim=64, num_heads=16, bias=True, add_bias_kv=True, add_zero_attn=True, batch_first=True)

            self.attention_1_3 = nn.MultiheadAttention(embed_dim=32, num_heads=8, bias=True, batch_first=True)

            self.attention_1_4 = nn.MultiheadAttention(embed_dim=40, num_heads=4, kdim=30, vdim=20, batch_first=True)
            self.attention_1_5 = nn.MultiheadAttention(embed_dim=40, num_heads=8, kdim=30, vdim=20, bias=False, add_bias_kv=False, add_zero_attn=False, batch_first=True)
            self.attention_1_6 = nn.MultiheadAttention(embed_dim=40, num_heads=10, kdim=30, vdim=20, bias=True, add_bias_kv=True, add_zero_attn=True, batch_first=True)

    def forward(self, xq, xk, xv, z, zmask, yq, yk, yv, ymask, ymask2):
        x0, x0w = self.attention_0_0(xq, xk, xv)
        x1, x1w = self.attention_0_1(xq, xk, xv)
        x2, x2w = self.attention_0_2(xq, xk, xk)

        x3, _ = self.attention_0_3(z, z, z, need_weights=False)
        x33, _ = self.attention_0_3(z, z, z, attn_mask=zmask)

        x4, x4w = self.attention_0_4(yq, yk, yv)
        x5, x5w = self.attention_0_5(yq, yk, yv, attn_mask=ymask)
        x6, x6w = self.attention_0_6(yq, yk, yv, attn_mask=ymask2)

        if version.parse(torch.__version__) < version.parse('1.9'):
            return x0, x0w, x1, x1w, x2, x2w, x3, x33, x4, x4w, x5, x5w, x6, x6w

        xq = xq.transpose(0, 1)
        xk = xk.transpose(0, 1)
        xv = xv.transpose(0, 1)
        z = z.transpose(0, 1)
        yq = yq.transpose(0, 1)
        yk = yk.transpose(0, 1)
        yv = yv.transpose(0, 1)

        y0, y0w = self.attention_1_0(xq, xk, xv)
        y1, y1w = self.attention_1_1(xq, xk, xv)
        y2, y2w = self.attention_1_2(xq, xk, xk)

        y3, _ = self.attention_1_3(z, z, z)
        if version.parse(torch.__version__) >= version.parse('1.12') and version.parse(torch.__version__) < version.parse('1.13'):
            # HACK pytorch 1.12 breaks 2-dim zmask
            # https://github.com/pytorch/pytorch/issues/97409
            # zmask2 = zmask.reshape(1, 1, 30, 30).expand(1, 8, 30, 30)
            # y33, _ = self.attention_1_3(z, z, z, attn_mask=zmask2)
            # but it produce all nan then, skip test :(
            y33 = y3
        elif version.parse(torch.__version__) >= version.parse('2.0') and version.parse(torch.__version__) < version.parse('2.1'):
            # HACK pytorch 2.0 produce all nan, skip test :(
            y33 = y3
        else:
            y33, _ = self.attention_1_3(z, z, z, attn_mask=zmask)

        y4, y4w = self.attention_1_4(yq, yk, yv)
        y5, y5w = self.attention_1_5(yq, yk, yv, attn_mask=ymask)
        y6, y6w = self.attention_1_6(yq, yk, yv, attn_mask=ymask2)

        return x0, x0w, x1, x1w, x2, x2w, x3, x33, x4, x4w, x5, x5w, x6, x6w, y0, y0w, y1, y1w, y2, y2w, y3, y33, y4, y4w, y5, y5w, y6, y6w

def test():
    torch.set_grad_enabled(False)

    net = Model()
    net.eval()

    torch.manual_seed(0)
    xq = torch.rand(20, 1, 64)
    xk = torch.rand(20, 1, 64)
    xv = torch.rand(20, 1, 64)
    z = torch.rand(30, 1, 32)
    zmask = torch.rand(30, 30)
    yq = torch.rand(15, 1, 40)
    yk = torch.rand(24, 1, 30)
    yv = torch.rand(24, 1, 20)
    ymask = torch.rand(15, 24)
    ymask2 = torch.rand(10, 15, 24)

    a = net(xq, xk, xv, z, zmask, yq, yk, yv, ymask, ymask2)

    # export torchscript
    print(torch.__version__)
    if version.parse(torch.__version__) >= version.parse('1.12.0'):
        mod = torch.jit.trace(net, (xq, xk, xv, z, zmask, yq, yk, yv, ymask, ymask2), check_trace=False)
    else:
        mod = torch.jit.trace(net, (xq, xk, xv, z, zmask, yq, yk, yv, ymask, ymask2))
    mod.save("test_nn_MultiheadAttention.pt")

    # torchscript to pnnx
    import os
    os.system("../src/pnnx test_nn_MultiheadAttention.pt inputshape=[20,1,64],[20,1,64],[20,1,64],[30,1,32],[30,30],[15,1,40],[24,1,30],[24,1,20],[15,24],[10,15,24]")

    # pnnx inference
    import test_nn_MultiheadAttention_pnnx
    b = test_nn_MultiheadAttention_pnnx.test_inference()

    for a0, b0 in zip(a, b):
        if not torch.allclose(a0, b0, 1e-4, 1e-4):
            return False
    return True

if __name__ == "__main__":
    if test():
        exit(0)
    else:
        exit(1)