Swin-UNet代码学习（PyTorch）

模型流程学习：

self.swin_unet = SwinTransformerSys(img_size=config.DATA.IMG_SIZE,
                                    patch_size=config.MODEL.SWIN.PATCH_SIZE,
                                    in_chans=config.MODEL.SWIN.IN_CHANS,
                                    num_classes=self.num_classes,
                                    embed_dim=config.MODEL.SWIN.EMBED_DIM,
                                    depths=config.MODEL.SWIN.DEPTHS,
                                    num_heads=config.MODEL.SWIN.NUM_HEADS,
                                    window_size=config.MODEL.SWIN.WINDOW_SIZE,
                                    mlp_ratio=config.MODEL.SWIN.MLP_RATIO,
                                    qkv_bias=config.MODEL.SWIN.QKV_BIAS,
                                    qk_scale=config.MODEL.SWIN.QK_SCALE,
                                    drop_rate=config.MODEL.DROP_RATE,
                                    drop_path_rate=config.MODEL.DROP_PATH_RATE,
                                    ape=config.MODEL.SWIN.APE,
                                    patch_norm=config.MODEL.SWIN.PATCH_NORM,
                                    use_checkpoint=config.TRAIN.USE_CHECKPOINT)

图片大小设置为224×224×3（输入图片为3通道），patch_size：4，num_classes：21843（类别数量），embed_dim：线性投影的输出通道数。

Swin Transformer

参数设置：

def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, embed_dim=96, depths=[2, 2, 2, 2], depths_decoder=[1, 2, 2, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, norm_layer=nn.LayerNorm, ape=False, patch_norm=True, use_checkpoint=False, final_upsample="expand_first", **kwargs):

Patch Embedding

划分patch：模型先将224×224的图像分割成每块大小为4×4的patch，因此共有56×56个patch（224/4=56）。这一步由PatchEmbed完成。

编码初始特征：conv卷积（输入通道3，输出通道96，卷积核4，步长4），经过norm后特征为56×56×96。

# 将图片分割为不重叠的patch
self.patch_embed = PatchEmbed(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, norm_layer=norm_layer if self.patch_norm else None)
num_patches = self.patch_embed.num_patches
patches_resolution = self.patch_embed.patches_resolution
self.patches_resolution = patches_resolution

注意：预处理（Patch Embedding）完成后，特征维度是 $\frac{H}{4} \times \frac{W}{4} \times C$。

BasicLayer：

PatchMerging：downsample

# Each block is either W-MSA (window multi-head self-attention) or SW-MSA
# (shifted-window multi-head self-attention). A stage usually holds an even
# number of blocks and the two kinds alternate: with 6 blocks, blocks 0, 2, 4
# are W-MSA and blocks 1, 3, 5 are SW-MSA. After a stage finishes, downsampling
# is applied; the input to the first downsampling is still 56*56 patches.
class PatchMerging(nn.Module):
    """Merge every 2x2 group of neighbouring patches into a single token.

    The four patches of a group are concatenated along the channel axis
    (C -> 4*C), normalised, and linearly projected to 2*C channels, so the
    token count drops by a factor of four while the channel count doubles.

    Args:
        input_resolution (tuple[int]): (H, W) of the incoming feature map.
        dim (int): Number of input channels C.
        norm_layer (nn.Module, optional): Normalisation layer class applied
            to the 4*C concatenated features. Default: nn.LayerNorm
    """

    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        # Registration order (reduction, then norm) is kept so that existing
        # state dicts and seeded initialisation line up with the original.
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x):
        """Downsample a token sequence.

        Args:
            x: Tensor of shape (B, H*W, C).

        Returns:
            Tensor of shape (B, H/2*W/2, 2*C).
        """
        height, width = self.input_resolution
        batch, num_tokens, channels = x.shape
        assert num_tokens == height * width, "input feature has wrong size"
        assert height % 2 == 0 and width % 2 == 0, f"x size ({height}*{width}) are not even."

        grid = x.view(batch, height, width, channels)
        # Channel order: top-left, bottom-left, top-right, bottom-right
        # (row offset varies fastest), each of shape B H/2 W/2 C.
        corners = [grid[:, row::2, col::2, :] for col in (0, 1) for row in (0, 1)]
        merged = torch.cat(corners, -1).view(batch, -1, 4 * channels)  # B H/2*W/2 4*C
        return self.reduction(self.norm(merged))

下采样前仍是56×56个patch：每相邻的2×2个patch为一组，分别取每组中左上、左下、右上、右下位置的patch，在通道维度上拼接在一起（通道数变为4C），再经过一个LayerNorm层和一个Linear层（4C→2C），实现分辨率下采样、通道数加倍的效果。实际上它可以看成一种加权池化的过程。——进行四次下采样。

SwinTransformerBlock：

shift_size < window_size < input_resolution

# Window attention:
class WindowAttention(nn.Module):
    """Multi-head self-attention inside a window, with a learned relative position bias.

    An optional additive mask can be supplied per window (used for shifted
    windows).

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): (Wh, Ww), height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): Add a learnable bias to the fused
            query/key/value projection. Default: True
        qk_scale (float | None, optional): Replaces the default scale of
            ``head_dim ** -0.5`` when set to a truthy value.
        attn_drop (float, optional): Dropout rate on the attention weights.
            Default: 0.0
        proj_drop (float, optional): Dropout rate on the output projection.
            Default: 0.0
    """

    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        win_h, win_w = window_size[0], window_size[1]

        # One learnable bias per head for every possible (dh, dw) offset
        # between two tokens of the same window.
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * win_h - 1) * (2 * win_w - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # Pair-wise relative position index for each token inside the window.
        # Built by broadcasting rather than torch.meshgrid: meshgrid without
        # an explicit `indexing` argument warns on recent PyTorch, while
        # passing the argument fails on old releases.
        rows = torch.arange(win_h).repeat_interleave(win_w)  # Wh*Ww, row of each token (row-major)
        cols = torch.arange(win_w).repeat(win_h)  # Wh*Ww, column of each token
        rel_rows = rows[:, None] - rows[None, :] + (win_h - 1)  # shift to start from 0
        rel_cols = cols[:, None] - cols[None, :] + (win_w - 1)
        # Flatten the 2-D offset into a single row index of the bias table.
        relative_position_index = rel_rows * (2 * win_w - 1) + rel_cols  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """Apply windowed self-attention.

        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None

        Returns:
            Tensor with the same shape as ``x``.
        """
        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = q @ k.transpose(-2, -1)

        tokens_per_window = self.window_size[0] * self.window_size[1]
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.view(-1)
        ].view(tokens_per_window, tokens_per_window, -1)  # Wh*Ww, Wh*Ww, nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            # Expose the window axis so each window receives its own mask,
            # then fold it back into the batch axis.
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
        attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

本站仅提供存储服务，所有内容均由用户发布，如发现有害或侵权内容，请点击举报。