This article introduces implementations of several attention mechanisms, including EA/MHSA/SK/DA/EPSA.
[Deep Learning] Attention Mechanisms (Part 1)
[Deep Learning] Attention Mechanisms (Part 3)
Contents
1. EA (External Attention)
2. Multi-Head Self-Attention
3. SK (Selective Kernel Networks)
4. DA (Dual Attention)
5. EPSA (Efficient Pyramid Squeeze Attention)

1. EA (External Attention)
EA (External Attention) can attend to global spatial information. (Paper: paper link)
Its structure is shown in the figure below; the code is as follows (code link):
import math

import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.modules.batchnorm import _BatchNorm

# In the original repository norm_layer is injected by the surrounding framework;
# nn.BatchNorm2d is used here as a stand-in so the module runs on its own.
norm_layer = nn.BatchNorm2d


class External_attention(nn.Module):
    '''
    Arguments:
        c (int): The input and output channel number.
    '''
    def __init__(self, c):
        super(External_attention, self).__init__()
        self.conv1 = nn.Conv2d(c, c, 1)

        self.k = 64
        self.linear_0 = nn.Conv1d(c, self.k, 1, bias=False)

        self.linear_1 = nn.Conv1d(self.k, c, 1, bias=False)
        # the second memory unit shares (transposed) weights with the first
        self.linear_1.weight.data = self.linear_0.weight.data.permute(1, 0, 2)

        self.conv2 = nn.Sequential(
            nn.Conv2d(c, c, 1, bias=False),
            norm_layer(c))

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.Conv1d):
                n = m.kernel_size[0] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, _BatchNorm):
                m.weight.data.fill_(1)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        idn = x
        x = self.conv1(x)

        b, c, h, w = x.size()
        n = h * w
        x = x.view(b, c, h * w)                                # b, c, n

        attn = self.linear_0(x)                                # b, k, n
        attn = F.softmax(attn, dim=-1)                         # b, k, n
        attn = attn / (1e-9 + attn.sum(dim=1, keepdim=True))   # double normalization, b, k, n
        x = self.linear_1(attn)                                # b, c, n

        x = x.view(b, c, h, w)
        x = self.conv2(x)
        x = x + idn
        x = F.relu(x)
        return x
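A minimal usage sketch (not from the original post; the tensor shape and channel count are chosen purely for illustration, assuming the norm_layer stand-in above):

x = torch.randn(2, 64, 32, 32)      # (batch, channels, height, width)
ea = External_attention(c=64)       # c must match the input channel count
out = ea(x)                         # output keeps the input shape: (2, 64, 32, 32)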
2. Multi-Head Self-Attention
The classic attention mechanism and the cornerstone of the Transformer. (Paper: paper link)
Its structure is shown in the figure below; the code is as follows (code link):
import numpy as np
import torch
from torch import nn
from torch.nn import init


class ScaledDotProductAttention(nn.Module):
    '''
    Scaled dot-product attention
    '''

    def __init__(self, d_model, d_k, d_v, h, dropout=.1):
        '''
        :param d_model: Output dimensionality of the model
        :param d_k: Dimensionality of queries and keys
        :param d_v: Dimensionality of values
        :param h: Number of heads
        '''
        super(ScaledDotProductAttention, self).__init__()
        self.fc_q = nn.Linear(d_model, h * d_k)
        self.fc_k = nn.Linear(d_model, h * d_k)
        self.fc_v = nn.Linear(d_model, h * d_v)
        self.fc_o = nn.Linear(h * d_v, d_model)
        self.dropout = nn.Dropout(dropout)

        self.d_model = d_model
        self.d_k = d_k
        self.d_v = d_v
        self.h = h

        self.init_weights()

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def forward(self, queries, keys, values, attention_mask=None, attention_weights=None):
        '''
        Computes the attention output.
        :param queries: Queries (b_s, nq, d_model)
        :param keys: Keys (b_s, nk, d_model)
        :param values: Values (b_s, nk, d_model)
        :param attention_mask: Mask over attention values (b_s, h, nq, nk). True indicates masking.
        :param attention_weights: Multiplicative weights for attention values (b_s, h, nq, nk).
        :return:
        '''
        b_s, nq = queries.shape[:2]
        nk = keys.shape[1]

        q = self.fc_q(queries).view(b_s, nq, self.h, self.d_k).permute(0, 2, 1, 3)  # (b_s, h, nq, d_k)
        k = self.fc_k(keys).view(b_s, nk, self.h, self.d_k).permute(0, 2, 3, 1)     # (b_s, h, d_k, nk)
        v = self.fc_v(values).view(b_s, nk, self.h, self.d_v).permute(0, 2, 1, 3)   # (b_s, h, nk, d_v)

        att = torch.matmul(q, k) / np.sqrt(self.d_k)  # (b_s, h, nq, nk)
        if attention_weights is not None:
            att = att * attention_weights
        if attention_mask is not None:
            att = att.masked_fill(attention_mask, -np.inf)
        att = torch.softmax(att, -1)
        att = self.dropout(att)

        out = torch.matmul(att, v).permute(0, 2, 1, 3).contiguous().view(b_s, nq, self.h * self.d_v)  # (b_s, nq, h*d_v)
        out = self.fc_o(out)  # (b_s, nq, d_model)
        return out
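A minimal usage sketch (dimensions are illustrative only; passing the same tensor as queries, keys and values gives self-attention):

queries = torch.randn(4, 50, 512)      # (batch, sequence length, d_model)
mhsa = ScaledDotProductAttention(d_model=512, d_k=64, d_v=64, h=8)
out = mhsa(queries, queries, queries)  # self-attention
print(out.shape)                       # torch.Size([4, 50, 512])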
3. SK (Selective Kernel Networks)
SK is a channel attention mechanism. (Paper: paper link)
Its structure is shown in the figure below; the code is as follows (code link):
import numpy as np
import torch
from torch import nn
from torch.nn import init
from collections import OrderedDict


class SKAttention(nn.Module):

    def __init__(self, channel=512, kernels=[1, 3, 5, 7], reduction=16, group=1, L=32):
        super().__init__()
        self.d = max(L, channel // reduction)
        self.convs = nn.ModuleList([])
        for k in kernels:
            self.convs.append(
                nn.Sequential(OrderedDict([
                    ('conv', nn.Conv2d(channel, channel, kernel_size=k, padding=k // 2, groups=group)),
                    ('bn', nn.BatchNorm2d(channel)),
                    ('relu', nn.ReLU())
                ]))
            )
        self.fc = nn.Linear(channel, self.d)
        self.fcs = nn.ModuleList([])
        for i in range(len(kernels)):
            self.fcs.append(nn.Linear(self.d, channel))
        self.softmax = nn.Softmax(dim=0)

    def forward(self, x):
        bs, c, _, _ = x.size()
        conv_outs = []
        ### split: one branch per kernel size
        for conv in self.convs:
            conv_outs.append(conv(x))
        feats = torch.stack(conv_outs, 0)  # k,bs,channel,h,w

        ### fuse
        U = sum(conv_outs)  # bs,c,h,w

        ### reduce channels
        S = U.mean(-1).mean(-1)  # bs,c
        Z = self.fc(S)           # bs,d

        ### calculate attention weights
        weights = []
        for fc in self.fcs:
            weight = fc(Z)
            weights.append(weight.view(bs, c, 1, 1))          # bs,channel,1,1
        attention_weights = torch.stack(weights, 0)           # k,bs,channel,1,1
        attention_weights = self.softmax(attention_weights)   # softmax over the k branches

        ### fuse the branches with their attention weights
        V = (attention_weights * feats).sum(0)
        return V
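A minimal usage sketch (shapes are illustrative only):

x = torch.randn(2, 512, 7, 7)       # (batch, channels, height, width)
sk = SKAttention(channel=512, reduction=16)
out = sk(x)
print(out.shape)                    # torch.Size([2, 512, 7, 7])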
4. DA (Dual Attention)
DA fuses channel attention and spatial (position) attention. (Paper: paper link)
Its structure is shown in the figure below; the code is as follows (code link):
import numpy as np
import torch
from torch import nn
from torch.nn import init
from model.attention.SelfAttention import ScaledDotProductAttention
from model.attention.SimplifiedSelfAttention import SimplifiedScaledDotProductAttention


class PositionAttentionModule(nn.Module):

    def __init__(self, d_model=512, kernel_size=3, H=7, W=7):
        super().__init__()
        self.cnn = nn.Conv2d(d_model, d_model, kernel_size=kernel_size, padding=(kernel_size - 1) // 2)
        self.pa = ScaledDotProductAttention(d_model, d_k=d_model, d_v=d_model, h=1)

    def forward(self, x):
        bs, c, h, w = x.shape
        y = self.cnn(x)
        y = y.view(bs, c, -1).permute(0, 2, 1)  # bs,h*w,c
        y = self.pa(y, y, y)                    # bs,h*w,c
        return y


class ChannelAttentionModule(nn.Module):

    def __init__(self, d_model=512, kernel_size=3, H=7, W=7):
        super().__init__()
        self.cnn = nn.Conv2d(d_model, d_model, kernel_size=kernel_size, padding=(kernel_size - 1) // 2)
        self.pa = SimplifiedScaledDotProductAttention(H * W, h=1)

    def forward(self, x):
        bs, c, h, w = x.shape
        y = self.cnn(x)
        y = y.view(bs, c, -1)  # bs,c,h*w
        y = self.pa(y, y, y)   # bs,c,h*w
        return y


class DAModule(nn.Module):

    def __init__(self, d_model=512, kernel_size=3, H=7, W=7):
        super().__init__()
        self.position_attention_module = PositionAttentionModule(d_model=512, kernel_size=3, H=7, W=7)
        self.channel_attention_module = ChannelAttentionModule(d_model=512, kernel_size=3, H=7, W=7)

    def forward(self, input):
        bs, c, h, w = input.shape
        p_out = self.position_attention_module(input)
        c_out = self.channel_attention_module(input)
        p_out = p_out.permute(0, 2, 1).view(bs, c, h, w)
        c_out = c_out.view(bs, c, h, w)
        return p_out + c_out
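A minimal usage sketch (this assumes the two attention classes imported above are available on the Python path, as in the repository behind the code link; H and W must match the spatial size of the input):

x = torch.randn(2, 512, 7, 7)       # (batch, channels, height, width)
da = DAModule(d_model=512, kernel_size=3, H=7, W=7)
out = da(x)                         # sum of the position and channel branches
print(out.shape)                    # torch.Size([2, 512, 7, 7])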
5. EPSA (Efficient Pyramid Squeeze Attention)
EPSA builds multi-scale channel attention: parallel convolutions with different kernel sizes split the channels, and an SE-style module re-weights each branch. (Paper: paper link)
Its structure is shown in the figure below; the code is as follows (code link):
import torch  # needed for torch.cat in PSAModule
import torch.nn as nn


class SEWeightModule(nn.Module):

    def __init__(self, channels, reduction=16):
        super(SEWeightModule, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc1 = nn.Conv2d(channels, channels // reduction, kernel_size=1, padding=0)
        self.relu = nn.ReLU(inplace=True)
        self.fc2 = nn.Conv2d(channels // reduction, channels, kernel_size=1, padding=0)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.avg_pool(x)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        weight = self.sigmoid(out)
        return weight


def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1, groups=1):
    """standard convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride,
                     padding=padding, dilation=dilation, groups=groups, bias=False)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class PSAModule(nn.Module):

    def __init__(self, inplans, planes, conv_kernels=[3, 5, 7, 9], stride=1, conv_groups=[1, 4, 8, 16]):
        super(PSAModule, self).__init__()
        self.conv_1 = conv(inplans, planes // 4, kernel_size=conv_kernels[0], padding=conv_kernels[0] // 2,
                           stride=stride, groups=conv_groups[0])
        self.conv_2 = conv(inplans, planes // 4, kernel_size=conv_kernels[1], padding=conv_kernels[1] // 2,
                           stride=stride, groups=conv_groups[1])
        self.conv_3 = conv(inplans, planes // 4, kernel_size=conv_kernels[2], padding=conv_kernels[2] // 2,
                           stride=stride, groups=conv_groups[2])
        self.conv_4 = conv(inplans, planes // 4, kernel_size=conv_kernels[3], padding=conv_kernels[3] // 2,
                           stride=stride, groups=conv_groups[3])
        self.se = SEWeightModule(planes // 4)
        self.split_channel = planes // 4
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        batch_size = x.shape[0]
        # multi-scale branches
        x1 = self.conv_1(x)
        x2 = self.conv_2(x)
        x3 = self.conv_3(x)
        x4 = self.conv_4(x)

        feats = torch.cat((x1, x2, x3, x4), dim=1)
        feats = feats.view(batch_size, 4, self.split_channel, feats.shape[2], feats.shape[3])

        # SE weights for each branch
        x1_se = self.se(x1)
        x2_se = self.se(x2)
        x3_se = self.se(x3)
        x4_se = self.se(x4)

        x_se = torch.cat((x1_se, x2_se, x3_se, x4_se), dim=1)
        attention_vectors = x_se.view(batch_size, 4, self.split_channel, 1, 1)
        attention_vectors = self.softmax(attention_vectors)
        feats_weight = feats * attention_vectors
        for i in range(4):
            x_se_weight_fp = feats_weight[:, i, :, :]
            if i == 0:
                out = x_se_weight_fp
            else:
                out = torch.cat((x_se_weight_fp, out), 1)

        return out
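A minimal usage sketch (channel counts are illustrative; planes must be divisible by 4, and each entry of conv_groups must divide planes // 4):

x = torch.randn(2, 256, 14, 14)     # (batch, channels, height, width)
psa = PSAModule(inplans=256, planes=256)
out = psa(x)
print(out.shape)                    # torch.Size([2, 256, 14, 14])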