Let's build the Transformer network structure by hand in PyTorch and use it to complete a small translation task.
First, let's break down the structure of the Transformer. The Transformer is an Encoder-Decoder architecture: the Encoder is a stack of layers, each consisting of Multi-Head Attention + a Feed-Forward Network, and the Decoder is a stack of layers, each consisting of (masked) Multi-Head Attention + Multi-Head Attention + a Feed-Forward Network.
import numpy as np   # used later for the scaling factor in attention
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, corpus) -> None:
        super().__init__()
        self.src_emb = nn.Embedding(len(corpus.src_vocab), d_embedding)  # word embedding
        self.pos_emb = nn.Embedding.from_pretrained(
            get_sin_enc_table(corpus.src_len + 1, d_embedding), freeze=True)  # position embedding
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(encoder_n_layers)])
    def forward(self, enc_inputs):
        # position indices 1..seq_len (index 0 is reserved for padding)
        pos_indices = torch.arange(1, enc_inputs.size(1) + 1).unsqueeze(0).to(enc_inputs)
        enc_outputs = self.src_emb(enc_inputs) + self.pos_emb(pos_indices)
        enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)  # padding mask
        enc_self_attn_weights = []
        for layer in self.layers:
            enc_outputs, enc_self_attn_weight = layer(enc_outputs, enc_self_attn_mask)
            enc_self_attn_weights.append(enc_self_attn_weight)
        return enc_outputs, enc_self_attn_weights
class Decoder(nn.Module):
    def __init__(self, corpus) -> None:
        super().__init__()
        self.tgt_emb = nn.Embedding(len(corpus.tgt_vocab), d_embedding)  # word embedding
        self.pos_emb = nn.Embedding.from_pretrained(
            get_sin_enc_table(corpus.tgt_len + 1, d_embedding), freeze=True)  # position embedding
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(decoder_n_layers)])
    def forward(self, dec_inputs, enc_inputs, enc_outputs):
        pos_indices = torch.arange(1, dec_inputs.size(1) + 1).unsqueeze(0).to(dec_inputs)
        dec_outputs = self.tgt_emb(dec_inputs) + self.pos_emb(pos_indices)
        # generate the padding mask
        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs)
        # generate the subsequent (look-ahead) mask that hides future positions
        dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs)
        # combine the two masks
        dec_self_attn_mask = torch.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)
        # the encoder-decoder attention needs only the padding mask, built from the decoder and encoder inputs
        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)
        dec_self_attn_weights = []
        dec_enc_attn_weights = []
        for layer in self.layers:
            dec_outputs, dec_self_attn_weight, dec_enc_attn_weight = layer(
                dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
            dec_self_attn_weights.append(dec_self_attn_weight)
            dec_enc_attn_weights.append(dec_enc_attn_weight)
        return dec_outputs, dec_self_attn_weights, dec_enc_attn_weights
class Transformer(nn.Module):
    def __init__(self, corpus) -> None:
        super().__init__()
        self.encoder = Encoder(corpus)
        self.decoder = Decoder(corpus)
        # project the decoder output onto the target vocabulary
        self.projection = nn.Linear(d_embedding, len(corpus.tgt_vocab), bias=False)
    def forward(self, enc_inputs, dec_inputs):
        enc_outputs, enc_self_attn_weights = self.encoder(enc_inputs)
        dec_outputs, dec_self_attn_weights, dec_enc_attn_weights = self.decoder(dec_inputs, enc_inputs, enc_outputs)
        dec_logits = self.projection(dec_outputs)  # [batch, tgt_len, tgt_vocab_size]
        return dec_logits, enc_self_attn_weights, dec_self_attn_weights, dec_enc_attn_weights
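The code above relies on three helpers that are not shown here: get_sin_enc_table, get_attn_pad_mask, and get_attn_subsequent_mask. Below is a minimal sketch of each, consistent with how they are called above, assuming token id 0 is the padding token and that masks are True at positions to be hidden; the actual implementations may differ in detail.

import numpy as np
import torch

def get_sin_enc_table(n_position, embedding_dim):
    # sinusoidal positional-encoding table, shape [n_position, embedding_dim]
    sinusoid_table = np.zeros((n_position, embedding_dim))
    for pos in range(n_position):
        for i in range(embedding_dim):
            sinusoid_table[pos, i] = pos / np.power(10000, 2 * (i // 2) / embedding_dim)
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # even dimensions: sin
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # odd dimensions: cos
    return torch.FloatTensor(sinusoid_table)

def get_attn_pad_mask(seq_q, seq_k):
    # True wherever the key position is padding (token id 0, by assumption)
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    pad_mask = seq_k.eq(0).unsqueeze(1)                 # [batch, 1, len_k]
    return pad_mask.expand(batch_size, len_q, len_k)    # [batch, len_q, len_k]

def get_attn_subsequent_mask(seq):
    # upper-triangular mask that hides future positions in the decoder
    attn_shape = [seq.size(0), seq.size(1), seq.size(1)]
    subsequent_mask = torch.triu(torch.ones(attn_shape, dtype=torch.uint8), diagonal=1)
    return subsequent_mask                              # [batch, len_seq, len_seq]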
It's straightforward to see that two basic building blocks need to be implemented to realize the Transformer: Multi-Head Attention and the Feed-Forward Network.
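These two blocks are composed into the EncoderLayer and DecoderLayer referenced above. A minimal sketch of that composition, assuming the MultiHeadAttention and PoswiseFeedForwardNet modules implemented below, could look like this:

class EncoderLayer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.enc_self_attn = MultiHeadAttention()   # self-attention over the source
        self.pos_ffn = PoswiseFeedForwardNet()      # position-wise feed-forward network
    def forward(self, enc_inputs, enc_self_attn_mask):
        # Q, K and V are all the encoder input
        enc_outputs, attn_weights = self.enc_self_attn(enc_inputs, enc_inputs,
                                                       enc_inputs, enc_self_attn_mask)
        enc_outputs = self.pos_ffn(enc_outputs)
        return enc_outputs, attn_weights

class DecoderLayer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.dec_self_attn = MultiHeadAttention()   # masked self-attention over the target
        self.dec_enc_attn = MultiHeadAttention()    # encoder-decoder attention
        self.pos_ffn = PoswiseFeedForwardNet()
    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        dec_outputs, self_attn_weights = self.dec_self_attn(dec_inputs, dec_inputs,
                                                            dec_inputs, dec_self_attn_mask)
        # Q comes from the decoder, K and V from the encoder outputs
        dec_outputs, enc_attn_weights = self.dec_enc_attn(dec_outputs, enc_outputs,
                                                          enc_outputs, dec_enc_attn_mask)
        dec_outputs = self.pos_ffn(dec_outputs)
        return dec_outputs, self_attn_weights, enc_attn_weights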
Multi-Head Attention
To implement multi-head attention, the first step is to implement the attention mechanism itself.
Attention is computed as follows:
- Linearly transform the input to obtain the Q, K, and V matrices
- Take the dot product of Q and K, scale it, and apply softmax
- Use the resulting weights to compute a weighted sum over V
Multi-Head Attention simply runs several attention heads in parallel:
- Concatenate the outputs of the heads
- Pass the concatenation through a fully-connected layer so that the output of Multi-Head Attention has the same shape as its input
Let's walk through the computation of Multi-Head Attention by hand:
Assume the input sequence has length n and each token is encoded with dimension d, so the input is (n, d).
- Weight matrices: $ W_Q: (d, d_q), W_K: (d, d_q), W_V: (d, d_v) $
- The resulting Q, K, V are $ Q: (n, d_q), K: (n, d_q), V: (n, d_v) $
- Multiply Q by the transpose of K: $ Q \cdot K^T: (n, d_q) \cdot (d_q, n) = (n, n) $; the value at position (i, j) represents the similarity between the i-th token and the j-th token
- Scaling: does not change the shape of the matrix, only the values in it
- Softmax: normalizes the values in each row of the matrix
- Weighted sum over V: $ softmax(\frac{Q \cdot K^T}{\sqrt{d_k}}) \cdot V = (n, n) \cdot (n, d_v) = (n, d_v) $
- For an input of shape $ (n, d) $, a single head produces an output of shape $ (n, d_v) $; concatenating (stacking) multiple heads gives $ (n_{heads}, n, d_v) $
- Transpose and apply the fully-connected layer: $ (n_{heads}, n, d_v) -> (n, n_{heads} \cdot d_v) -> (n, d) $
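As a quick sanity check on these shapes, here is a minimal sketch of single-head scaled dot-product attention (assuming an optional boolean mask that is True at positions to be hidden):

import math
import torch

def scaled_dot_product_attention(Q, K, V, attn_mask=None):
    # Q: [..., n_q, d_q], K: [..., n_k, d_q], V: [..., n_k, d_v]
    d_k = Q.size(-1)
    scores = torch.matmul(Q, K.transpose(-1, -2)) / math.sqrt(d_k)  # [..., n_q, n_k]
    if attn_mask is not None:
        scores = scores.masked_fill(attn_mask, -1e9)   # hide masked positions
    weights = torch.softmax(scores, dim=-1)            # normalize along the key axis
    return torch.matmul(weights, V), weights           # output: [..., n_q, d_v]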
The full Multi-Head Attention module is implemented as follows:
class MultiHeadAttention(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.W_Q = nn.Linear(d_embedding, d_k * n_heads)
        self.W_K = nn.Linear(d_embedding, d_k * n_heads)
        self.W_V = nn.Linear(d_embedding, d_v * n_heads)
        self.linear = nn.Linear(n_heads * d_v, d_embedding)
        self.layer_norm = nn.LayerNorm(d_embedding)
    def forward(self, Q, K, V, attn_mask):
        '''
        Q: [batch, len_q, d_embedding]
        K: [batch, len_k, d_embedding]
        V: [batch, len_v, d_embedding]
        attn_mask: [batch, len_q, len_k]
        '''
        residual, batch_size = Q, Q.size(0)
        # step 1: linear transformation of the input + reshape into heads
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # [batch, n_heads, len_q, d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # [batch, n_heads, len_k, d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)  # [batch, n_heads, len_v, d_v]
        # step 2: compute attention scores: dot product + scaling
        scores = torch.matmul(q_s, k_s.transpose(-1, -2)) / np.sqrt(d_k)  # [batch_size, n_heads, len_q, len_k]
        # step 3: apply the attention mask, replacing scores at masked positions with a very large negative value
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)  # [batch_size, n_heads, len_q, len_k]
        scores.masked_fill_(attn_mask, -1e9)
        # step 4: normalize the attention scores
        weights = nn.Softmax(dim=-1)(scores)
        # step 5: compute the context vector as a weighted sum over V
        context = torch.matmul(weights, v_s)  # [batch_size, n_heads, len_q, d_v]
        # step 6: concatenate the heads and apply the fully-connected layer
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)  # [batch_size, len_q, n_heads * d_v]
        output = self.linear(context)  # [batch_size, len_q, d_embedding]
        # step 7: residual connection + layer normalization
        output = self.layer_norm(output + residual)
        return output, weights
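As a usage sketch, here is a quick shape check with random tensors; the hyperparameter values (d_embedding = 512, d_k = d_v = 64, n_heads = 8) are assumptions chosen only for illustration.

# hypothetical hyperparameters, chosen only for this shape check
d_embedding, d_k, d_v, n_heads = 512, 64, 64, 8
mha = MultiHeadAttention()
x = torch.rand(2, 10, d_embedding)               # [batch=2, seq_len=10, d_embedding]
mask = torch.zeros(2, 10, 10, dtype=torch.bool)  # mask nothing
out, weights = mha(x, x, x, mask)
print(out.shape)       # torch.Size([2, 10, 512])
print(weights.shape)   # torch.Size([2, 8, 10, 10])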
Feed-Forward Network
A Position-Wise Feed-Forward Network follows each attention sub-layer in the Encoder and Decoder; its role is to further transform the extracted features. This transformation is applied to every position of the sequence independently, with no mixing across positions and no recurrence, hence the name Position-Wise Feed-Forward.
The formula is:
$ FFN(x) = \max(0, xW_1 + b_1)W_2 + b_2 $
The computation first uses a conv1d (or fully-connected) layer to map the input sequence to a higher dimension d_ff (an adjustable hyperparameter, typically 4 times d), and then maps the result back down to the original dimension d.
The implementation using conv1d is as follows:
nn.Conv1d(in_channels, out_channels, kernel_size, ...)
$ (batch, n, d) -> (batch, d, n) -> (batch, d_ff, n) -> (batch, d, n) -> (batch, n, d) $
The transposes are needed because conv1d expects the channel dimension before the sequence dimension.
The first conv1d is parameterized:
nn.Conv1d(d, d_ff, 1, ...)
The second conv1d is parameterized:
nn.Conv1d(d_ff, d, 1, ...)
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self, d_ff=2048) -> None:
        super().__init__()
        # 1D convolution that maps the input to the higher dimension d_ff
        self.conv1 = nn.Conv1d(in_channels=d_embedding, out_channels=d_ff, kernel_size=1)
        # 1D convolution that maps the input back to the original dimension
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_embedding, kernel_size=1)
        self.layer_norm = nn.LayerNorm(d_embedding)
    def forward(self, inputs):
        '''
        inputs: [batch_size, len_q, embedding_dim]
        output: [batch_size, len_q, embedding_dim]
        '''
        residual = inputs
        # conv1d expects [batch, channels, length], so transpose before and after
        output = self.conv1(inputs.transpose(1, 2))
        output = nn.ReLU()(output)
        output = self.conv2(output)
        output = self.layer_norm(output.transpose(1, 2) + residual)
        return output
The implementation using fc (nn.Linear) is as follows:
nn.Linear(in_features, out_features, bias=True)
$ (batch, n, d) -> (batch, n, d_ff) -> (batch, n, d) $
The first fc is parameterized:
nn.Linear(d, d_ff, bias=True)
The second fc is parameterized:
nn.Linear(d_ff, d, bias=True)
class PoswiseFeedForwardNet_fc(nn.Module):
    def __init__(self, d_ff=2048) -> None:
        super().__init__()
        # fully-connected layer that maps the input to the higher dimension d_ff
        self.fc1 = nn.Linear(d_embedding, d_ff, bias=True)
        # fully-connected layer that maps the input back to the original dimension
        self.fc2 = nn.Linear(d_ff, d_embedding, bias=True)
        self.layer_norm = nn.LayerNorm(d_embedding)
    def forward(self, inputs):
        '''
        inputs: [batch_size, len_q, embedding_dim]
        output: [batch_size, len_q, embedding_dim]
        '''
        residual = inputs
        output = self.fc1(inputs)
        output = nn.ReLU()(output)
        output = self.fc2(output)
        output = self.layer_norm(output + residual)
        return output
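With kernel_size=1, the conv1d version is equivalent to applying a linear layer at every position, so both variants compute the same kind of mapping. A quick shape check, again assuming a hypothetical d_embedding = 512 for illustration:

d_embedding = 512                    # assumed embedding size for this check
ffn_conv = PoswiseFeedForwardNet()
ffn_fc = PoswiseFeedForwardNet_fc()
x = torch.rand(2, 10, d_embedding)   # [batch, seq_len, d_embedding]
print(ffn_conv(x).shape)             # torch.Size([2, 10, 512])
print(ffn_fc(x).shape)               # torch.Size([2, 10, 512])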
Reference Links:
GPT Illustration