1. Two ways to implement Patch_Embedding

Both methods below compute the same patch embedding: the first unfolds the image into flattened patches and projects them with a matrix multiplication; the second applies the same projection as a strided conv2d.
import torch
from torch.nn import functional as F

def image2embed_naive(image, patch_size, weight):
    """
    Split the image into non-overlapping patches with F.unfold, then project
    each flattened patch with a matrix multiplication.
    :param image: [bs, in_channel, height, width]
    :param patch_size: side length of each (square) patch
    :param weight: [patch_depth = in_channel*patch_size*patch_size, model_dim_C]
    :return: patch_embedding of shape [bs, num_patches, model_dim_C]
    """
    # unfold output: [bs, patch_depth, num_patches] -> [bs, num_patches, patch_depth]
    image_output = F.unfold(image, kernel_size=(patch_size, patch_size),
                            stride=(patch_size, patch_size)).transpose(-1, -2)
    patch_embedding = image_output @ weight  # [bs, num_patches, model_dim_C]
    return patch_embedding

def image2conv(image, weight, patch_size):
    """
    Equivalent patch embedding via a strided convolution.
    :param weight: [model_dim_C, in_channel, patch_size, patch_size]
    """
    conv_output = F.conv2d(image, weight=weight, stride=patch_size)  # [bs, oc, oh, ow]
    bs, oc, oh, ow = conv_output.shape
    # flatten the spatial grid into num_patches = oh*ow tokens
    patch_embedding = conv_output.reshape(bs, oc, oh * ow).transpose(-1, -2)
    return patch_embedding
batch_size = 1
in_channel = 2
out_channel = 5  # model_dim_C
height = 3       # not a multiple of patch_size: both methods drop the last row
width = 4
image = torch.randn(batch_size, in_channel, height, width)
patch_size = 2

weight1_depth = in_channel * patch_size * patch_size
weight1_model_c = out_channel
weight1 = torch.randn(weight1_depth, weight1_model_c)
# F.unfold flattens each patch channel-first, which matches conv2d's
# [out_channel, in_channel, kh, kw] weight layout, so weight2 is just
# weight1 with the same numbers rearranged.
weight2 = weight1.transpose(0, 1).reshape(weight1_model_c, in_channel, patch_size, patch_size)

output1 = image2embed_naive(image, patch_size, weight1)
output2 = image2conv(image, weight2, patch_size)
flag = torch.isclose(output1, output2).all()
print(f"flag={flag}")
print(f"output1={output1}")
print(f"output2={output2}")
print(f"output1.shape={output1.shape}")
print(f"output2.shape={output2.shape}")
2. Multi-Head Self-Attention (Multi_Head_Self_Attention)
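A minimal sketch of multi-head self-attention, assuming an input of shape [bs, seq_len, model_dim] such as the patch embedding above; this is the standard formulation (names and the fused qkv projection are illustrative, not necessarily the original code):

import torch
from torch import nn

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, model_dim, num_head):
        super().__init__()
        assert model_dim % num_head == 0
        self.num_head = num_head
        self.head_dim = model_dim // num_head
        # fused projection producing q, k, v in one matmul
        self.qkv = nn.Linear(model_dim, 3 * model_dim)
        self.proj = nn.Linear(model_dim, model_dim)

    def forward(self, x):
        bs, seq_len, model_dim = x.shape
        q, k, v = self.qkv(x).chunk(3, dim=-1)  # each [bs, seq_len, model_dim]
        # split into heads: [bs, num_head, seq_len, head_dim]
        q = q.reshape(bs, seq_len, self.num_head, self.head_dim).transpose(1, 2)
        k = k.reshape(bs, seq_len, self.num_head, self.head_dim).transpose(1, 2)
        v = v.reshape(bs, seq_len, self.num_head, self.head_dim).transpose(1, 2)
        # scaled dot-product attention per head
        attn = (q @ k.transpose(-1, -2)) / (self.head_dim ** 0.5)
        attn = attn.softmax(dim=-1)
        out = attn @ v  # [bs, num_head, seq_len, head_dim]
        # merge heads back to [bs, seq_len, model_dim]
        out = out.transpose(1, 2).reshape(bs, seq_len, model_dim)
        return self.proj(out)

mhsa = MultiHeadSelfAttention(model_dim=8, num_head=2)
tokens = torch.randn(1, 6, 8)
print(mhsa(tokens).shape)  # torch.Size([1, 6, 8])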