Let's build the Transformer network structure by hand in PyTorch and use it to complete a small translation task.
First, let's break down the structure of the Transformer. The Transformer is an Encoder-Decoder architecture: the Encoder is a stack of layers, each consisting of Multi-Head Attention + a Feed-Forward Network, and the Decoder is a stack of layers, each consisting of (masked) Multi-Head Attention + Multi-Head Attention + a Feed-Forward Network.
import numpy as np   # used later for the scaling factor in attention
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, corpus) -> None:
        super().__init__()
        self.src_emb = nn.Embedding(len(corpus.src_vocab), d_embedding)  # word embedding
        self.pos_emb = nn.Embedding.from_pretrained(
            get_sin_enc_table(corpus.src_len + 1, d_embedding), freeze=True)  # position embedding
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(encoder_n_layers)])
    def forward(self, enc_inputs):
        # position indices 1..seq_len (index 0 is reserved for padding)
        pos_indices = torch.arange(1, enc_inputs.size(1) + 1).unsqueeze(0).to(enc_inputs)
        enc_outputs = self.src_emb(enc_inputs) + self.pos_emb(pos_indices)
        enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)  # padding mask
        enc_self_attn_weights = []
        for layer in self.layers:
            enc_outputs, enc_self_attn_weight = layer(enc_outputs, enc_self_attn_mask)
            enc_self_attn_weights.append(enc_self_attn_weight)
        return enc_outputs, enc_self_attn_weights
class Decoder(nn.Module):
    def __init__(self, corpus) -> None:
        super().__init__()
        self.tgt_emb = nn.Embedding(len(corpus.tgt_vocab), d_embedding)  # word embedding
        self.pos_emb = nn.Embedding.from_pretrained(
            get_sin_enc_table(corpus.tgt_len + 1, d_embedding), freeze=True)  # position embedding
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(decoder_n_layers)])
    def forward(self, dec_inputs, enc_inputs, enc_outputs):
        pos_indices = torch.arange(1, dec_inputs.size(1) + 1).unsqueeze(0).to(dec_inputs)
        dec_outputs = self.tgt_emb(dec_inputs) + self.pos_emb(pos_indices)
        # generate the padding mask
        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs)
        # generate the subsequent (look-ahead) mask that hides future positions
        dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs)
        # combine the two masks
        dec_self_attn_mask = torch.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)
        # the encoder-decoder attention needs only the padding mask, built from the decoder and encoder inputs
        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)
        dec_self_attn_weights = []
        dec_enc_attn_weights = []
        for layer in self.layers:
            dec_outputs, dec_self_attn_weight, dec_enc_attn_weight = layer(
                dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
            dec_self_attn_weights.append(dec_self_attn_weight)
            dec_enc_attn_weights.append(dec_enc_attn_weight)
        return dec_outputs, dec_self_attn_weights, dec_enc_attn_weights
class Transformer(nn.Module):
    def __init__(self, corpus) -> None:
        super().__init__()
        self.encoder = Encoder(corpus)
        self.decoder = Decoder(corpus)
        # project the decoder output onto the target vocabulary
        self.projection = nn.Linear(d_embedding, len(corpus.tgt_vocab), bias=False)
    def forward(self, enc_inputs, dec_inputs):
        enc_outputs, enc_self_attn_weights = self.encoder(enc_inputs)
        dec_outputs, dec_self_attn_weights, dec_enc_attn_weights = self.decoder(dec_inputs, enc_inputs, enc_outputs)
        dec_logits = self.projection(dec_outputs)  # [batch, tgt_len, tgt_vocab_size]
        return dec_logits, enc_self_attn_weights, dec_self_attn_weights, dec_enc_attn_weights
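The code above relies on three helpers that are not shown here: get_sin_enc_table, get_attn_pad_mask, and get_attn_subsequent_mask. Below is a minimal sketch of each, consistent with how they are called above, assuming token id 0 is the padding token and that masks are True at positions to be hidden; the actual implementations may differ in detail.

import numpy as np
import torch

def get_sin_enc_table(n_position, embedding_dim):
    # sinusoidal positional-encoding table, shape [n_position, embedding_dim]
    sinusoid_table = np.zeros((n_position, embedding_dim))
    for pos in range(n_position):
        for i in range(embedding_dim):
            sinusoid_table[pos, i] = pos / np.power(10000, 2 * (i // 2) / embedding_dim)
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # even dimensions: sin
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # odd dimensions: cos
    return torch.FloatTensor(sinusoid_table)

def get_attn_pad_mask(seq_q, seq_k):
    # True wherever the key position is padding (token id 0, by assumption)
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    pad_mask = seq_k.eq(0).unsqueeze(1)                 # [batch, 1, len_k]
    return pad_mask.expand(batch_size, len_q, len_k)    # [batch, len_q, len_k]

def get_attn_subsequent_mask(seq):
    # upper-triangular mask that hides future positions in the decoder
    attn_shape = [seq.size(0), seq.size(1), seq.size(1)]
    subsequent_mask = torch.triu(torch.ones(attn_shape, dtype=torch.uint8), diagonal=1)
    return subsequent_mask                              # [batch, len_seq, len_seq]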
It's straightforward to see that two basic building blocks need to be implemented to realize the Transformer: Multi-Head Attention and the Feed-Forward Network.
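These two blocks are composed into the EncoderLayer and DecoderLayer referenced above. A minimal sketch of that composition, assuming the MultiHeadAttention and PoswiseFeedForwardNet modules implemented below, could look like this:

class EncoderLayer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.enc_self_attn = MultiHeadAttention()   # self-attention over the source
        self.pos_ffn = PoswiseFeedForwardNet()      # position-wise feed-forward network
    def forward(self, enc_inputs, enc_self_attn_mask):
        # Q, K and V are all the encoder input
        enc_outputs, attn_weights = self.enc_self_attn(enc_inputs, enc_inputs,
                                                       enc_inputs, enc_self_attn_mask)
        enc_outputs = self.pos_ffn(enc_outputs)
        return enc_outputs, attn_weights

class DecoderLayer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.dec_self_attn = MultiHeadAttention()   # masked self-attention over the target
        self.dec_enc_attn = MultiHeadAttention()    # encoder-decoder attention
        self.pos_ffn = PoswiseFeedForwardNet()
    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        dec_outputs, self_attn_weights = self.dec_self_attn(dec_inputs, dec_inputs,
                                                            dec_inputs, dec_self_attn_mask)
        # Q comes from the decoder, K and V from the encoder outputs
        dec_outputs, enc_attn_weights = self.dec_enc_attn(dec_outputs, enc_outputs,
                                                          enc_outputs, dec_enc_attn_mask)
        dec_outputs = self.pos_ffn(dec_outputs)
        return dec_outputs, self_attn_weights, enc_attn_weights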
Multi-Head Attention
To implement multi-head attention, the first step is to implement the attention mechanism itself.
Attention is computed as follows:
- Linearly transform the input to obtain the Q, K, and V matrices
- Take the dot product of Q and K, scale it, and apply softmax
- Use the resulting weights to compute a weighted sum over V
Multi-Head Attention simply runs several attention heads in parallel:
- Concatenate the outputs of the heads
- Pass the concatenation through a fully-connected layer so that the output of Multi-Head Attention has the same shape as its input
Let's walk through the computation of Multi-Head Attention by hand:
Assume the input sequence has length n and each token is encoded with dimension d, so the input is (n, d).
- Weight matrices: $ W_Q: (d, d_q), W_K: (d, d_q), W_V: (d, d_v) $
- The resulting Q, K, V are $ Q: (n, d_q), K: (n, d_q), V: (n, d_v) $
- Multiply Q by the transpose of K: $ Q \cdot K^T: (n, d_q) \cdot (d_q, n) = (n, n) $; the value at position (i, j) represents the similarity between the i-th token and the j-th token
- Scaling: does not change the shape of the matrix, only the values in it
- Softmax: normalizes the values in each row of the matrix
- Weighted sum over V: $ softmax(\frac{Q \cdot K^T}{\sqrt{d_k}}) \cdot V = (n, n) \cdot (n, d_v) = (n, d_v) $
- For an input of shape $ (n, d) $, a single head produces an output of shape $ (n, d_v) $; concatenating (stacking) multiple heads gives $ (n_{heads}, n, d_v) $
- Transpose and apply the fully-connected layer: $ (n_{heads}, n, d_v) -> (n, n_{heads} \cdot d_v) -> (n, d) $
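As a quick sanity check on these shapes, here is a minimal sketch of single-head scaled dot-product attention (assuming an optional boolean mask that is True at positions to be hidden):

import math
import torch

def scaled_dot_product_attention(Q, K, V, attn_mask=None):
    # Q: [..., n_q, d_q], K: [..., n_k, d_q], V: [..., n_k, d_v]
    d_k = Q.size(-1)
    scores = torch.matmul(Q, K.transpose(-1, -2)) / math.sqrt(d_k)  # [..., n_q, n_k]
    if attn_mask is not None:
        scores = scores.masked_fill(attn_mask, -1e9)   # hide masked positions
    weights = torch.softmax(scores, dim=-1)            # normalize along the key axis
    return torch.matmul(weights, V), weights           # output: [..., n_q, d_v]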
The full Multi-Head Attention module is implemented as follows:
class MultiHeadAttention(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.W_Q = nn.Linear(d_embedding, d_k * n_heads)
        self.W_K = nn.Linear(d_embedding, d_k * n_heads)
        self.W_V = nn.Linear(d_embedding, d_v * n_heads)
        self.linear = nn.Linear(n_heads * d_v, d_embedding)
        self.layer_norm = nn.LayerNorm(d_embedding)
    def forward(self, Q, K, V, attn_mask):
        '''
        Q: [batch, len_q, d_embedding]
        K: [batch, len_k, d_embedding]
        V: [batch, len_v, d_embedding]
        attn_mask: [batch, len_q, len_k]
        '''
        residual, batch_size = Q, Q.size(0)
        # step 1: linear transformation of the input + reshape into heads
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # [batch, n_heads, len_q, d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # [batch, n_heads, len_k, d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)  # [batch, n_heads, len_v, d_v]
        # step 2: compute attention scores: dot product + scaling
        scores = torch.matmul(q_s, k_s.transpose(-1, -2)) / np.sqrt(d_k)  # [batch_size, n_heads, len_q, len_k]
        # step 3: apply the attention mask, replacing scores at masked positions with a very large negative value
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)  # [batch_size, n_heads, len_q, len_k]
        scores.masked_fill_(attn_mask, -1e9)
        # step 4: normalize the attention scores
        weights = nn.Softmax(dim=-1)(scores)
        # step 5: compute the context vector as a weighted sum over V
        context = torch.matmul(weights, v_s)  # [batch_size, n_heads, len_q, d_v]
        # step 6: concatenate the heads and apply the fully-connected layer
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)  # [batch_size, len_q, n_heads * d_v]
        output = self.linear(context)  # [batch_size, len_q, d_embedding]
        # step 7: residual connection + layer normalization
        output = self.layer_norm(output + residual)
        return output, weights
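As a usage sketch, here is a quick shape check with random tensors; the hyperparameter values (d_embedding = 512, d_k = d_v = 64, n_heads = 8) are assumptions chosen only for illustration.

# hypothetical hyperparameters, chosen only for this shape check
d_embedding, d_k, d_v, n_heads = 512, 64, 64, 8
mha = MultiHeadAttention()
x = torch.rand(2, 10, d_embedding)               # [batch=2, seq_len=10, d_embedding]
mask = torch.zeros(2, 10, 10, dtype=torch.bool)  # mask nothing
out, weights = mha(x, x, x, mask)
print(out.shape)       # torch.Size([2, 10, 512])
print(weights.shape)   # torch.Size([2, 8, 10, 10])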
Feed-Forward Network
A Position-Wise Feed-Forward Network follows each attention sub-layer in the Encoder and Decoder; its role is to further transform the extracted features. This transformation is applied to every position of the sequence independently, with no mixing across positions and no recurrence, hence the name Position-Wise Feed-Forward.
The formula is:
$ FFN(x) = \max(0, xW_1 + b_1)W_2 + b_2 $
The computation first uses a conv1d (or fully-connected) layer to map the input sequence to a higher dimension d_ff (an adjustable hyperparameter, typically 4 times d), and then maps the result back down to the original dimension d.
The implementation using conv1d is as follows:
nn.Conv1d(in_channels, out_channels, kernel_size, ...)
$ (batch, n, d) -> (batch, d, n) -> (batch, d_ff, n) -> (batch, d, n) -> (batch, n, d) $
The transposes are needed because conv1d expects the channel dimension before the sequence dimension.
The first conv1d is parameterized:
nn.Conv1d(d, d_ff, 1, ...)
The second conv1d is parameterized:
nn.Conv1d(d_ff, d, 1, ...)
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self, d_ff=2048) -> None:
        super().__init__()
        # 1D convolution that maps the input to the higher dimension d_ff
        self.conv1 = nn.Conv1d(in_channels=d_embedding, out_channels=d_ff, kernel_size=1)
        # 1D convolution that maps the input back to the original dimension
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_embedding, kernel_size=1)
        self.layer_norm = nn.LayerNorm(d_embedding)
    def forward(self, inputs):
        '''
        inputs: [batch_size, len_q, embedding_dim]
        output: [batch_size, len_q, embedding_dim]
        '''
        residual = inputs
        # conv1d expects [batch, channels, length], so transpose before and after
        output = self.conv1(inputs.transpose(1, 2))
        output = nn.ReLU()(output)
        output = self.conv2(output)
        output = self.layer_norm(output.transpose(1, 2) + residual)
        return output
The implementation using fc (nn.Linear) is as follows:
nn.Linear(in_features, out_features, bias=True)
$ (batch, n, d) -> (batch, n, d_ff) -> (batch, n, d) $
The first fc is parameterized:
nn.Linear(d, d_ff, bias=True)
The second fc is parameterized:
nn.Linear(d_ff, d, bias=True)
class PoswiseFeedForwardNet_fc(nn.Module):
    def __init__(self, d_ff=2048) -> None:
        super().__init__()
        # fully-connected layer that maps the input to the higher dimension d_ff
        self.fc1 = nn.Linear(d_embedding, d_ff, bias=True)
        # fully-connected layer that maps the input back to the original dimension
        self.fc2 = nn.Linear(d_ff, d_embedding, bias=True)
        self.layer_norm = nn.LayerNorm(d_embedding)
    def forward(self, inputs):
        '''
        inputs: [batch_size, len_q, embedding_dim]
        output: [batch_size, len_q, embedding_dim]
        '''
        residual = inputs
        output = self.fc1(inputs)
        output = nn.ReLU()(output)
        output = self.fc2(output)
        output = self.layer_norm(output + residual)
        return output
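With kernel_size=1, the conv1d version is equivalent to applying a linear layer at every position, so both variants compute the same kind of mapping. A quick shape check, again assuming a hypothetical d_embedding = 512 for illustration:

d_embedding = 512                    # assumed embedding size for this check
ffn_conv = PoswiseFeedForwardNet()
ffn_fc = PoswiseFeedForwardNet_fc()
x = torch.rand(2, 10, d_embedding)   # [batch, seq_len, d_embedding]
print(ffn_conv(x).shape)             # torch.Size([2, 10, 512])
print(ffn_fc(x).shape)               # torch.Size([2, 10, 512])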
Reference Links:
GPT Illustration