This article takes a top-down look at DSANet from the source-code perspective.

Overall structure

Three parallel branches: a global self-attention module, a local self-attention module, and a linear autoregressive (AR) module.
Parameter meanings

Parameter | Meaning |
---|---|
window (int) | length of the input window |
n_multiv (int) | number of univariate time series |
n_kernels (int) | number of convolution channels |
w_kernel (int) | initial number of channels; defaults to 1 |
local (int) | width of the 1D convolution kernel in the local module |
d_k (int) | dimension per key, d_model / n_head |
d_v (int) | dimension per value, d_model / n_head |
d_model (int) | dimension of the encoder representation (model dimension) |
d_inner (int) | inner-layer dimension of the position-wise feed-forward network |
n_layers (int) | number of encoder layers |
n_head (int) | number of attention heads |
drop_prob (float) | dropout probability |
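For orientation, here is a minimal sketch of how these hyperparameters might be bundled before building the model. The concrete values are illustrative placeholders, not taken from the paper; the real project assembles them with a HyperOptArgumentParser.

```python
from argparse import Namespace

# Illustrative values only; they depend on the dataset and tuning.
hparams = Namespace(
    window=64,      # length of the input window
    n_multiv=8,     # number of univariate series
    n_kernels=32,   # number of convolution channels
    w_kernel=1,     # initial number of channels
    local=3,        # width of the local 1D convolution kernel
    d_model=512,
    d_inner=2048,
    d_k=64,         # d_model / n_head
    d_v=64,
    n_layers=6,
    n_head=8,
    drop_prob=0.1,
)
```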
```python
class DSANet(LightningModule):

    def __init__(self, hparams):
        """
        Pass in parsed HyperOptArgumentParser to the model
        """
        super(DSANet, self).__init__()
        ...  # init variables

        # build model
        self.__build_model()

    def __build_model(self):
        """
        Layout model
        """
        self.sgsf = Single_Global_SelfAttn_Module(
            window=self.window, n_multiv=self.n_multiv, n_kernels=self.n_kernels,
            w_kernel=self.w_kernel, d_k=self.d_k, d_v=self.d_v, d_model=self.d_model,
            d_inner=self.d_inner, n_layers=self.n_layers, n_head=self.n_head, drop_prob=self.drop_prob)

        self.slsf = Single_Local_SelfAttn_Module(
            window=self.window, local=self.local, n_multiv=self.n_multiv, n_kernels=self.n_kernels,
            w_kernel=self.w_kernel, d_k=self.d_k, d_v=self.d_v, d_model=self.d_model,
            d_inner=self.d_inner, n_layers=self.n_layers, n_head=self.n_head, drop_prob=self.drop_prob)

        self.ar = AR(window=self.window)
        self.W_output1 = nn.Linear(2 * self.n_kernels, 1)
        self.dropout = nn.Dropout(p=self.drop_prob)
        self.active_func = nn.Tanh()

    def forward(self, x):
        """
        :param x: [batch, window, n_multiv]
        :return: output: [batch, 1, n_multiv]
        """
        sgsf_output, *_ = self.sgsf(x)                        # sgsf_output: [batch, n_multiv, n_kernels]
        slsf_output, *_ = self.slsf(x)                        # slsf_output: [batch, n_multiv, n_kernels]
        sf_output = torch.cat((sgsf_output, slsf_output), 2)  # sf_output: [batch, n_multiv, 2*n_kernels]
        sf_output = self.dropout(sf_output)
        sf_output = self.W_output1(sf_output)                 # sf_output: [batch, n_multiv, 1]
        sf_output = torch.transpose(sf_output, 1, 2)          # sf_output: [batch, 1, n_multiv]

        ar_output = self.ar(x)                                # ar_output: [batch, 1, n_multiv]

        output = sf_output + ar_output
        return output
```
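To make the fusion step concrete, here is a quick shape walk-through with dummy tensors; batch=4, n_multiv=8, n_kernels=32 are placeholder sizes, not values from the paper.

```python
import torch
import torch.nn as nn

batch, n_multiv, n_kernels = 4, 8, 32                   # placeholder sizes
sgsf_output = torch.randn(batch, n_multiv, n_kernels)   # stand-in for the global branch
slsf_output = torch.randn(batch, n_multiv, n_kernels)   # stand-in for the local branch

sf_output = torch.cat((sgsf_output, slsf_output), 2)    # [batch, n_multiv, 2*n_kernels]
sf_output = nn.Linear(2 * n_kernels, 1)(sf_output)      # [batch, n_multiv, 1]
sf_output = sf_output.transpose(1, 2)                   # [batch, 1, n_multiv]

ar_output = torch.randn(batch, 1, n_multiv)             # stand-in for the AR branch
output = sf_output + ar_output
print(output.shape)                                     # torch.Size([4, 1, 8])
```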
Global self-attention module
```python
class Single_Global_SelfAttn_Module(nn.Module):

    def __init__(
            self,
            window, n_multiv, n_kernels, w_kernel,
            d_k, d_v, d_model, d_inner,
            n_layers, n_head, drop_prob=0.1):
        super(Single_Global_SelfAttn_Module, self).__init__()

        self.window = window
        self.w_kernel = w_kernel
        self.n_multiv = n_multiv
        self.d_model = d_model
        self.drop_prob = drop_prob
        self.conv2 = nn.Conv2d(1, n_kernels, (window, w_kernel))
        self.in_linear = nn.Linear(n_kernels, d_model)
        self.out_linear = nn.Linear(d_model, n_kernels)
        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=drop_prob)
            for _ in range(n_layers)])

    def forward(self, x, return_attns=False):
        x = x.view(-1, self.w_kernel, self.window, self.n_multiv)  # x: [batch, 1, window, n_multiv]
        x2 = F.relu(self.conv2(x))                                 # x2: [batch, n_kernels, 1, n_multiv]
        x2 = nn.Dropout(p=self.drop_prob)(x2)
        x = torch.squeeze(x2, 2)                                   # x: [batch, n_kernels, n_multiv]
        x = torch.transpose(x, 1, 2)                               # x: [batch, n_multiv, n_kernels]
        src_seq = self.in_linear(x)                                # src_seq: [batch, n_multiv, d_model]

        enc_slf_attn_list = []
        enc_output = src_seq                                       # enc_output: [batch, n_multiv, d_model]
        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(enc_output)
            # enc_output: [batch, n_multiv, d_model]
            # enc_slf_attn: [batch, n_multiv, n_multiv]
            if return_attns:
                enc_slf_attn_list += [enc_slf_attn]

        enc_output = self.out_linear(enc_output)                   # enc_output: [batch, n_multiv, n_kernels]

        if return_attns:
            return enc_output, enc_slf_attn_list
        return enc_output,
```
Local self-attention module

The global and local modules differ only in the width of the 1D convolution kernel: the global kernel spans the full window, while the local kernel is narrower than the window. In the local case the convolution produces not a single value per channel but a shorter sequence, so a max pooling layer is applied on top of the convolution output; the sketch below illustrates the shapes.
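A quick shape check of both convolutions with placeholder sizes (window=64, local=3, etc. are illustrative): the global kernel collapses the time dimension to 1 directly, whereas the local kernel leaves window - local + 1 steps, which the adaptive max pooling then reduces to 1.

```python
import torch
import torch.nn as nn

batch, window, n_multiv, n_kernels, w_kernel, local = 4, 64, 8, 32, 1, 3  # placeholder sizes
x = torch.randn(batch, w_kernel, window, n_multiv)         # [batch, 1, window, n_multiv]

conv_global = nn.Conv2d(1, n_kernels, (window, w_kernel))  # kernel spans the whole window
print(conv_global(x).shape)                                # torch.Size([4, 32, 1, 8])

conv_local = nn.Conv2d(1, n_kernels, (local, w_kernel))    # narrower kernel
pool = nn.AdaptiveMaxPool2d((1, n_multiv))
x1 = conv_local(x)
print(x1.shape)                                            # torch.Size([4, 32, 62, 8]) = window - local + 1 steps
print(pool(x1).shape)                                      # torch.Size([4, 32, 1, 8])
```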
```python
class Single_Local_SelfAttn_Module(nn.Module):

    def __init__(
            self,
            window, local, n_multiv, n_kernels, w_kernel,
            d_k, d_v, d_model, d_inner,
            n_layers, n_head, drop_prob=0.1):
        super(Single_Local_SelfAttn_Module, self).__init__()

        self.window = window
        self.w_kernel = w_kernel
        self.n_multiv = n_multiv
        self.d_model = d_model
        self.drop_prob = drop_prob
        self.conv1 = nn.Conv2d(1, n_kernels, (local, w_kernel))
        self.pooling1 = nn.AdaptiveMaxPool2d((1, n_multiv))
        self.in_linear = nn.Linear(n_kernels, d_model)
        self.out_linear = nn.Linear(d_model, n_kernels)
        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=drop_prob)
            for _ in range(n_layers)])

    def forward(self, x, return_attns=False):
        # x: [batch, window, n_multiv]
        x = x.view(-1, self.w_kernel, self.window, self.n_multiv)  # x: [batch, 1, window, n_multiv]
        x1 = F.relu(self.conv1(x))    # x1: [batch, n_kernels, window - local + 1, n_multiv]
        x1 = self.pooling1(x1)        # x1: [batch, n_kernels, 1, n_multiv]
        x1 = nn.Dropout(p=self.drop_prob)(x1)
        x = torch.squeeze(x1, 2)      # x: [batch, n_kernels, n_multiv]
        x = torch.transpose(x, 1, 2)  # x: [batch, n_multiv, n_kernels]
        src_seq = self.in_linear(x)   # src_seq: [batch, n_multiv, d_model]

        enc_slf_attn_list = []
        enc_output = src_seq
        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(enc_output)  # enc_output: [batch, n_multiv, d_model]
            if return_attns:
                enc_slf_attn_list += [enc_slf_attn]

        enc_output = self.out_linear(enc_output)              # enc_output: [batch, n_multiv, n_kernels]

        if return_attns:
            return enc_output, enc_slf_attn_list
        return enc_output,
```
Linear autoregressive module

The same set of autoregressive coefficients is shared across all series; the short check after the class below verifies this.
```python
class AR(nn.Module):

    def __init__(self, window):
        super(AR, self).__init__()
        self.linear = nn.Linear(window, 1)

    def forward(self, x):
        # x: [batch, window, n_multiv]
        x = torch.transpose(x, 1, 2)  # x: [batch, n_multiv, window]
        x = self.linear(x)            # x: [batch, n_multiv, 1]
        x = torch.transpose(x, 1, 2)  # x: [batch, 1, n_multiv]
        return x
```
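Since the transpose moves the window dimension to the last axis, the single nn.Linear(window, 1) is applied to every one of the n_multiv series with the same weights. A quick check with placeholder sizes, reusing the AR class above:

```python
import torch

batch, window, n_multiv = 4, 64, 8          # placeholder sizes
ar = AR(window)
x = torch.randn(batch, window, n_multiv)
print(ar(x).shape)                          # torch.Size([4, 1, 8])

# Applying the single weight vector to series 0 by hand gives the same result,
# showing that the coefficients are shared across all series.
w, b = ar.linear.weight, ar.linear.bias     # w: [1, window], b: [1]
y0 = x[:, :, 0] @ w.t() + b                 # [batch, 1]
print(torch.allclose(ar(x)[:, :, 0], y0))   # True
```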
EncoderLayer

An EncoderLayer is composed of two sub-layers: multi-head self-attention followed by a position-wise feed-forward network.
```python
class EncoderLayer(nn.Module):
    ''' Compose with two layers '''

    def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.slf_attn = MultiHeadAttention(
            n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)

    def forward(self, enc_input):
        enc_output, enc_slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input)
        enc_output = self.pos_ffn(enc_output)
        return enc_output, enc_slf_attn
```
MultiHeadAttention
```python
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class MultiHeadAttention(nn.Module):
    ''' Multi-Head Attention module '''

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Linear(d_model, n_head * d_k)
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)
        nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))

        self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
        self.layer_norm = nn.LayerNorm(d_model)

        self.fc = nn.Linear(n_head * d_v, d_model)
        nn.init.xavier_normal_(self.fc.weight)

        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v):
        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
        sz_b, len_q, _ = q.size()  # sz_b, len_q = batch, n_multiv
        sz_b, len_k, _ = k.size()
        sz_b, len_v, _ = v.size()

        residual = q                                     # q: [batch, n_multiv, d_model]

        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)  # q: [batch, n_multiv, n_head, d_k]
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)

        q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k)  # q: [n_head * batch, n_multiv, d_k]
        k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k)
        v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v)

        output, attn = self.attention(q, k, v)           # output: [n_head * batch, n_multiv, d_v]

        output = output.view(n_head, sz_b, len_q, d_v)   # output: [n_head, batch, n_multiv, d_v]
        output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1)  # output: [batch, n_multiv, n_head * d_v]

        output = self.dropout(self.fc(output))
        output = self.layer_norm(output + residual)
        return output, attn
```
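Treating the class above as given (together with ScaledDotProductAttention, defined next), here is a shape check with placeholder sizes: the [batch, n_multiv, d_model] input shape is preserved, and the attention maps come back stacked along the head dimension as [n_head * batch, n_multiv, n_multiv].

```python
import torch

batch, n_multiv, d_model = 4, 8, 512
n_head, d_k, d_v = 8, 64, 64                 # placeholder sizes
x = torch.randn(batch, n_multiv, d_model)

mha = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=0.1)
output, attn = mha(x, x, x)                  # self-attention: q = k = v
print(output.shape)                          # torch.Size([4, 8, 512])
print(attn.shape)                            # torch.Size([32, 8, 8])
```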
```python
class ScaledDotProductAttention(nn.Module):
    ''' Scaled Dot-Product Attention '''

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v):
        attn = torch.bmm(q, k.transpose(1, 2))
        attn = attn / self.temperature
        attn = self.softmax(attn)
        attn = self.dropout(attn)
        output = torch.bmm(attn, v)
        return output, attn
```
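Here q, k, v arrive from MultiHeadAttention already flattened to [n_head * batch, n_multiv, d_k or d_v], so the bmm → softmax → bmm chain computes softmax(QKᵀ / temperature) V independently for every head. A small check with placeholder sizes; eval() is used only to switch off the attention dropout so the rows sum to 1:

```python
import torch

n_head, batch, n_multiv, d_k, d_v = 8, 4, 6, 64, 64   # placeholder sizes
q = torch.randn(n_head * batch, n_multiv, d_k)
k = torch.randn(n_head * batch, n_multiv, d_k)
v = torch.randn(n_head * batch, n_multiv, d_v)

attention = ScaledDotProductAttention(temperature=d_k ** 0.5).eval()
output, attn = attention(q, k, v)
print(output.shape)   # torch.Size([32, 6, 64])
print(attn.shape)     # torch.Size([32, 6, 6])
print(torch.allclose(attn.sum(dim=2), torch.ones(n_head * batch, n_multiv)))  # True: each row is a distribution over keys
```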
PositionwiseFeedForward
```python
class PositionwiseFeedForward(nn.Module):
    ''' A two-feed-forward-layer module '''

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Conv1d(d_in, d_hid, 1)
        self.w_2 = nn.Conv1d(d_hid, d_in, 1)
        self.layer_norm = nn.LayerNorm(d_in)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = x
        output = x.transpose(1, 2)
        output = self.w_2(F.relu(self.w_1(output)))
        output = output.transpose(1, 2)
        output = self.dropout(output)
        output = self.layer_norm(output + residual)
        return output
```
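The two Conv1d layers use kernel size 1, so after the transpose they act as position-wise linear layers applied independently at each of the n_multiv positions. A small equivalence check with placeholder sizes:

```python
import torch
import torch.nn as nn

batch, n_multiv, d_model = 4, 8, 512                # placeholder sizes
x = torch.randn(batch, n_multiv, d_model)

conv = nn.Conv1d(d_model, d_model, 1)
lin = nn.Linear(d_model, d_model)
with torch.no_grad():                                # copy the conv parameters into the linear layer
    lin.weight.copy_(conv.weight.squeeze(-1))
    lin.bias.copy_(conv.bias)

y_conv = conv(x.transpose(1, 2)).transpose(1, 2)     # what PositionwiseFeedForward does internally
y_lin = lin(x)                                       # plain position-wise linear layer
print(torch.allclose(y_conv, y_lin, atol=1e-6))      # True: kernel-size-1 Conv1d == position-wise Linear
```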