CS 447 HW4 - Using Attention for Neural Machine Translation

In this notebook we are going to perform machine translation using a deep-learning-based approach with an attention mechanism.

Specifically, we are going to train a sequence-to-sequence model for Spanish-to-English translation. In this assignment you only need to implement the encoder and decoder; we implement all the data loading for you. Please refer to the following resources for more details:

https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf
https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
https://arxiv.org/pdf/1409.0473.pdf
from collections import defaultdict
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
import numpy as np
import pandas as pd
import pickle
import re
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import unicodedata

if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # device = torch.device('cpu')
Helper Functions
# Converts a unicode string to plain ascii
def unicode_to_ascii(s):
    """Normalizes latin chars with accent to their canonical decomposition"""
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')


# Preprocessing the sentence to add the start, end tokens and make them lower-case
def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r'([?.!,¿])', r' \1 ', w)
    w = re.sub(r'[" "]+', ' ', w)

    w = re.sub(r'[^a-zA-Z?.!,¿]+', ' ', w)
    
    w = w.rstrip().strip()
    w = '<start> ' + w + ' <end>'
    return w
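# Example (illustrative):
#   preprocess_sentence('¿Puedo tomar prestado este libro?')
# returns
#   '<start> ¿ puedo tomar prestado este libro ? <end>'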


def max_length(tensor):
    return max(len(t) for t in tensor)


def pad_sequences(x, max_len):
    padded = np.zeros((max_len), dtype=np.int64)
    if len(x) > max_len:
        padded[:] = x[:max_len]
    else:
        padded[:len(x)] = x
    return padded
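# Examples (illustrative):
#   pad_sequences([7, 3, 2], 5)          -> array([7, 3, 2, 0, 0])
#   pad_sequences([7, 3, 2, 9, 4, 1], 5) -> array([7, 3, 2, 9, 4])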


def preprocess_data_to_tensor(dataframe, inp_lang, targ_lang):
    # Vectorize the input and target languages
    input_tensor = [[inp_lang.word2idx[s if s in inp_lang.vocab else '<unk>'] for s in es.split(' ')] for es in dataframe['es'].values.tolist()]
    target_tensor = [[targ_lang.word2idx[s if s in targ_lang.vocab else '<unk>'] for s in eng.split(' ')] for eng in dataframe['eng'].values.tolist()]

    # calculate the max_length of input and output tensor for padding
    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)
    print('max_length_inp: {}, max_length_tar: {}'.format(max_length_inp, max_length_tar))

    # pad all the sentences in the dataset with the max_length
    input_tensor = [pad_sequences(x, max_length_inp) for x in input_tensor]
    target_tensor = [pad_sequences(x, max_length_tar) for x in target_tensor]

    return input_tensor, target_tensor, max_length_inp, max_length_tar


def train_val_split(input_tensor, target_tensor):
    """Creating training and test/val sets using an 80-20 split"""
    total_num_examples = len(input_tensor)
    num_val = int(total_num_examples/5)
    num_train = total_num_examples - num_val
    
    input_tensor_train, input_tensor_val = input_tensor[:num_train], input_tensor[num_train:]
    target_tensor_train, target_tensor_val = target_tensor[:num_train], target_tensor[num_train:]

    assert len(input_tensor_train) == num_train
    assert len(target_tensor_train) == num_train
    assert len(input_tensor_val) == num_val
    assert len(target_tensor_val) == num_val

    return input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val


def sort_batch(X, y, lengths):
    """sort batch function to be able to use with pad_packed_sequence"""
    lengths, indx = lengths.sort(dim=0, descending=True)
    X = X[indx]
    y = y[indx]
    return X.transpose(0,1), y, lengths # transpose (batch x seq) to (seq x batch)
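# Example (illustrative): for a batch with lengths [3, 5, 2], sort_batch reorders the
# examples so the lengths become [5, 3, 2] and transposes X from [batch, seq] to [seq, batch],
# which is the layout pack_padded_sequence expects (descending lengths, batch_first=False).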
Download The Data
Here we will download the translation data. We will learn a model to translate Spanish to English.

NOTE: Comment out all lines in the cell below before submitting your code. These statements are only valid in a notebook file and not in a Python .py file.

# NOTE: Comment all 3 lines in the below cell before submitting your code to gradescope.
if __name__ == '__main__':
    !wget http://www.manythings.org/anki/spa-eng.zip
    !unzip -o spa-eng.zip
if __name__ == '__main__':
    lines = open('spa.txt', encoding='UTF-8').read().strip().split('\n')
    total_num_examples = 30000 
    original_word_pairs = [[w for w in l.split('\t')][:2] for l in lines[:total_num_examples]]
    data = pd.DataFrame(original_word_pairs, columns=['eng', 'es'])
    print(data) # visualizing the data
if __name__ == '__main__':
    # Make sure you only run this once - if you run it twice it will mess up the data and you will have to run the cell above again
    # Now we do the preprocessing using pandas and lambdas
    data['eng'] = data.eng.apply(lambda w: preprocess_sentence(w))
    data['es'] = data.es.apply(lambda w: preprocess_sentence(w))
    print(data) # visualizing the data
Vocabulary Class
We create a class here for managing our vocabulary, as we did in HW2. In this HW, we have a separate class for the vocabulary because we need two different vocabularies - one for English and one for Spanish.

class Vocab_Lang():
    def __init__(self, vocab):
        self.word2idx = {'<pad>': 0, '<unk>': 1}
        self.idx2word = {0: '<pad>', 1: '<unk>'}
        self.vocab = vocab
        
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 2 # +2 because of <pad> and <unk> token
            self.idx2word[index + 2] = word
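As a quick sanity check of the indexing (illustrative only; the real vocabularies are built later from the gensim FastText models):

toy_vocab = Vocab_Lang(['hola', 'mundo'])
print(toy_vocab.word2idx)     # {'<pad>': 0, '<unk>': 1, 'hola': 2, 'mundo': 3}
print(toy_vocab.idx2word[2])  # 'hola'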
Dataloader for our Encoder and Decoder
We prepare the dataloader and make sure it returns the source sentence, the target sentence, and the length of the source sentence sampled from the training dataset.

class MyData(Dataset):
    def __init__(self, X, y):
        self.length = torch.LongTensor([np.sum(1 - np.equal(x, 0)) for x in X])
        self.data = torch.LongTensor(X)
        self.target = torch.LongTensor(y)
    
    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        x_len = self.length[index]
        return x, y, x_len
  
    def __len__(self):
        return len(self.data)
Train your model
You will train your model here.

Pass the source sentences and their corresponding lengths into the encoder.
Create the decoder input using <start> tokens.
Compute the decoder outputs. These are usually conditioned on the previously predicted word, but during training we use teacher forcing. Read more about teacher forcing at https://machinelearningmastery.com/teacher-forcing-for-recurrent-neural-networks/
Evaluation
We evaluate on the test set.
In this evaluation, instead of teacher forcing, we feed the decoder's own prediction back in as its input at the next time step (see the sketch below).
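To make the contrast concrete, here is a minimal, self-contained sketch (illustrative only; stub_decoder is a hypothetical stand-in for the attention decoder implemented below):

import torch

batch_size, max_len, vocab_size = 2, 4, 6
ys = torch.randint(1, vocab_size, (batch_size, max_len))  # toy target batch [batch, seq]

def stub_decoder(dec_input):
    """Hypothetical stand-in for Decoder.forward; returns random logits over the vocabulary."""
    return torch.randn(dec_input.size(0), vocab_size)

# Teacher forcing (training): the ground-truth token is fed as the next input
dec_input = ys[:, 0].unsqueeze(1)
for t in range(1, max_len):
    predictions = stub_decoder(dec_input)
    dec_input = ys[:, t].unsqueeze(1)                          # ground truth feeds forward

# Greedy decoding (evaluation): the model's own prediction is fed as the next input
dec_input = ys[:, 0].unsqueeze(1)
for t in range(1, max_len):
    predictions = stub_decoder(dec_input)
    dec_input = torch.argmax(predictions, dim=1).unsqueeze(1)  # prediction feeds forward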
BLEU Score Calculation for Evaluation
Read more about the BLEU score at:

https://en.wikipedia.org/wiki/BLEU
https://www.aclweb.org/anthology/P02-1040.pdf
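For intuition, here is a toy BLEU computation with nltk (illustrative only, not assignment data):

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smoother = SmoothingFunction()
reference = [['the', 'cat', 'is', 'on', 'the', 'mat']]  # a list of reference token lists
candidate = ['the', 'cat', 'sat', 'on', 'the', 'mat']
print(sentence_bleu(reference, candidate, weights=(1,), smoothing_function=smoother.method1))        # BLEU-1
print(sentence_bleu(reference, candidate, weights=(1/2, 1/2), smoothing_function=smoother.method1))  # BLEU-2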
def loss_function(real, pred):
    """ Only consider non-zero inputs in the loss; mask needed """
    mask = real.ge(1).float() # 1.0 for real tokens, 0.0 for <pad> positions

    # compute the per-token loss (reduction='none') so the mask can zero out <pad> positions
    loss_ = F.cross_entropy(pred, real, reduction='none') * mask
    return torch.mean(loss_)
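A small illustration of how the padding mask zeroes out loss contributions from <pad> positions (illustrative only):

import torch
import torch.nn.functional as F

pred = torch.randn(4, 10)           # [batch_size, vocab_size] logits for one time step
real = torch.tensor([5, 2, 0, 7])   # index 0 is <pad>
mask = real.ge(1).float()           # tensor([1., 1., 0., 1.])
per_token = F.cross_entropy(pred, real, reduction='none') * mask
print(per_token)                    # the <pad> position contributes 0 to the loss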
def train_model(encoder, decoder, dataset, optimizer, targ_lang, device, n_epochs=10):
    for epoch in range(n_epochs):
        start = time.time()
        n_batch = 0
        total_loss = 0
        
        encoder.train()
        decoder.train()
        
        for inp, targ, inp_len in tqdm(dataset):
            n_batch += 1
            loss = 0
            
            xs, ys, lens = sort_batch(inp, targ, inp_len)
            enc_output, enc_hidden = encoder(xs.to(device), lens)
            dec_hidden = enc_hidden
            
            # use teacher forcing - feeding the target as the next input (via dec_input)
            dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * BATCH_SIZE)
        
            # run code below for every timestep in the ys batch
            for t in range(1, ys.size(1)):
                predictions, dec_hidden, _ = decoder(dec_input.to(device), dec_hidden.to(device), enc_output.to(device))
                loss += loss_function(ys[:, t].to(device), predictions.to(device))
                dec_input = ys[:, t].unsqueeze(1)
        
            batch_loss = (loss / int(ys.size(1)))
            total_loss += batch_loss
            
            optimizer.zero_grad()
            
            loss.backward()

            ### update model parameters
            optimizer.step()
        
        ### TODO: Save checkpoint for model (optional)
        print('Epoch:{:2d}/{}\t Loss:{:.4f} ({:.2f}s)'.format(epoch + 1, n_epochs, total_loss / n_batch, time.time() - start))

    print('Model trained!')
def get_reference_candidate(target, pred, targ_lang):
    reference = list(target)
    reference = [targ_lang.idx2word[s] for s in np.array(reference[1:])]
    candidate = list(pred)
    candidate = [targ_lang.idx2word[s] for s in np.array(candidate[1:])]
    return reference, candidate


def evaluate_model(encoder, decoder, val_dataset, target_tensor_val, max_length_tar, targ_lang, device):
    batch_size = val_dataset.batch_size
    n_batch = 0
    total_loss = 0

    encoder.eval()
    decoder.eval()

    final_output = torch.zeros((len(target_tensor_val), max_length_tar))
    target_output = torch.zeros((len(target_tensor_val), max_length_tar))

    with torch.no_grad():
        for batch, (inp, targ, inp_len) in enumerate(val_dataset):
            n_batch += 1
            loss = 0
            xs, ys, lens = sort_batch(inp, targ, inp_len)
            enc_output, enc_hidden = encoder(xs.to(device), lens)
            dec_hidden = enc_hidden
            
            dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * batch_size)
            curr_output = torch.zeros((ys.size(0), ys.size(1)))
            curr_output[:, 0] = dec_input.squeeze(1)

            for t in range(1, ys.size(1)): # run code below for every timestep in the ys batch
                predictions, dec_hidden, _ = decoder(dec_input.to(device), dec_hidden.to(device), enc_output.to(device))
                loss += loss_function(ys[:, t].to(device), predictions.to(device))
                dec_input = torch.argmax(predictions, dim=1).unsqueeze(1)
                curr_output[:, t] = dec_input.squeeze(1)
        
            final_output[batch*batch_size:(batch+1)*batch_size] = curr_output
            target_output[batch*batch_size:(batch+1)*batch_size] = ys  # use the sorted targets so rows stay aligned with curr_output
            batch_loss = (loss / int(ys.size(1)))
            total_loss += batch_loss

        print('Loss {:.4f}'.format(total_loss / n_batch))
    
    # Compute Bleu scores
    bleu_1 = 0.0
    bleu_2 = 0.0
    bleu_3 = 0.0
    bleu_4 = 0.0

    smoother = SmoothingFunction()
    save_reference = []
    save_candidate = []
    for i in range(len(target_tensor_val)):
        reference, candidate = get_reference_candidate(target_output[i], final_output[i], targ_lang)
    
        bleu_1 += sentence_bleu(reference, candidate, weights=(1,), smoothing_function=smoother.method1)
        bleu_2 += sentence_bleu(reference, candidate, weights=(1/2, 1/2), smoothing_function=smoother.method1)
        bleu_3 += sentence_bleu(reference, candidate, weights=(1/3, 1/3, 1/3), smoothing_function=smoother.method1)
        bleu_4 += sentence_bleu(reference, candidate, weights=(1/4, 1/4, 1/4, 1/4), smoothing_function=smoother.method1)

        save_reference.append(reference)
        save_candidate.append(candidate)
    
    bleu_1 = bleu_1/len(target_tensor_val)
    bleu_2 = bleu_2/len(target_tensor_val)
    bleu_3 = bleu_3/len(target_tensor_val)
    bleu_4 = bleu_4/len(target_tensor_val)

    # bleu_1 = corpus_bleu(save_reference, save_candidate, weights=(1,), smoothing_function=smoother.method1)
    # bleu_2 = corpus_bleu(save_reference, save_candidate, weights=(1/2, 1/2), smoothing_function=smoother.method1)
    # bleu_3 = corpus_bleu(save_reference, save_candidate, weights=(1/3, 1/3, 1/3), smoothing_function=smoother.method1)
    # bleu_4 = corpus_bleu(save_reference, save_candidate, weights=(1/4, 1/4, 1/4, 1/4), smoothing_function=smoother.method1)

    print('BLEU 1-gram: %f' % (bleu_1))
    print('BLEU 2-gram: %f' % (bleu_2))
    print('BLEU 3-gram: %f' % (bleu_3))
    print('BLEU 4-gram: %f' % (bleu_4))

    return save_candidate
Use Pretrained Embeddings
The embeddings used in HW2 were initialized with random vectors and learned during training. Here we will use the FastText embedding method proposed by Facebook's AI Research lab to improve our translation results. In particular, we will use an implementation from the gensim library to train embeddings on our corpus.

Read more about FastText and the gensim library: https://radimrehurek.com/gensim/models/fasttext.html#gensim.models.fasttext.FastText

from gensim.models import FastText
Train FastText Embeddings (Implement This)
def compute_FastText_embeddings(pd_dataframe, embedding_dim=256):
    """
    Given dataset (pd.DataFrame as used in the beginning), train FastText embeddings
    Return FastText trained model and embeddings vectors (np array [2 + vocab_size, embedding_dim])
    """
    print('Computing FastText Embeddings...')
    sentences = [sen.split() for sen in pd_dataframe]
    
    ##### TODO:
    # 1. create FastText model to learn `embedding_dim` sized embedding vectors
    #    (note: `size` is the gensim 3.x argument name; gensim 4.x renamed it to `vector_size`)
    model = FastText(size=embedding_dim)

    # 2. build vocab from sentences
    model.build_vocab(sentences=sentences)

    # 3. train model on sentences for 10 epochs
    model.train(sentences=sentences,total_examples=len(sentences),epochs=10)
    

    # 4. The sentences that we used to train the embedding don't contain '<pad>', or '<unk>' 
    # so add two all-zero or random rows in the beginning of the embedding numpy array for '<pad>' and '<unk>'
    embedding_vec = model.wv.vectors # np.array [vocab_size, embedding_dim]
    embedding_vec = np.insert(embedding_vec, [0], [[0], [0]], axis=0) # np.array [2 + vocab_size, embedding_dim]

    return model, embedding_vec
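A quick sanity check on a toy corpus (illustrative only; min_count=1 is set here just so the tiny vocabulary is kept, and the gensim 3.x argument names from above are assumed):

toy_sentences = [['<start>', 'hola', 'mundo', '<end>'], ['<start>', 'adios', 'mundo', '<end>']]
toy_model = FastText(size=8, min_count=1)
toy_model.build_vocab(sentences=toy_sentences)
toy_model.train(sentences=toy_sentences, total_examples=len(toy_sentences), epochs=10)
print(toy_model.wv['mundo'].shape)   # (8,)
print(toy_model.wv.vectors.shape)    # (vocab_size, 8)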
if __name__ == '__main__':
    # HYPERPARAMETERS (no need to experiment with other hyperparameters as these seem to work fine)
    BATCH_SIZE = 64
    EMBEDDING_DIM = 256
    UNITS = 512
    LEARNING_RATE = 0.001
if __name__ == '__main__':
    model_eng, embedding_eng = compute_FastText_embeddings(data['eng'], EMBEDDING_DIM)
    model_es, embedding_es = compute_FastText_embeddings(data['es'], EMBEDDING_DIM)
if __name__ == '__main__':
    inp_lang = Vocab_Lang(model_es.wv.vocab)
    targ_lang = Vocab_Lang(model_eng.wv.vocab)

    input_tensor, target_tensor, max_length_inp, max_length_tar = preprocess_data_to_tensor(data, inp_lang, targ_lang)
    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_val_split(input_tensor, target_tensor)

    # create train and val datasets
    train_dataset = MyData(input_tensor_train, target_tensor_train)
    train_dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=True)

    val_dataset = MyData(input_tensor_val, target_tensor_val)
    val_dataset = DataLoader(val_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=False)
if __name__ == '__main__':
    inp, targ, inp_len = train_dataset.dataset[0]
    print('Input:', inp)
    print('Target:', targ)
    print('Input Length:', inp_len)
Encoder Model (Implement This)
First we build a simple encoder model, which will be very similar to what you did in MP2. But instead of using a fully connected layer as the output, you should return the output of your recurrent net (GRU/LSTM) as well as its hidden state; both are used by the decoder later.

NOTE: Initialize your embedding layer with the pretrained FastText embeddings computed above. Refer to nn.Embedding.from_pretrained in https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html

class Encoder(nn.Module):
    def __init__(self, pretrained_emb, vocab_size, embedding_dim, enc_units, num_layers=1):
        super(Encoder, self).__init__()
        ##### TODO:
        # Convert pretrained_emb from np.array to torch.FloatTensor
        self.pretrained_emb = torch.from_numpy(pretrained_emb)
        self.vocab_size = vocab_size

        # Initialize embedding layer with pretrained_emb
        self.embedding = nn.Embedding.from_pretrained(self.pretrained_emb)
        
        # Initialize a single-directional LSTM/GRU with 1 layer and batch_first=False
        self.rnn = nn.LSTM(embedding_dim, enc_units, num_layers)
        
    def forward(self, x, lens):
        """
        Args:
            x: [max_len, batch_size]
            lens: [batch_size]

        Returns:
            unpacked_output: [max_len, batch_size, enc_units]
            hidden_state: [1, batch_size, enc_units]
        
        Pseudo-code:
        - Pass x through an embedding layer
        - Make sure x is correctly packed before the recurrent net 
        - Pass it through the recurrent net
        - Make sure the output is unpacked correctly
        - Return hidden states from the recurrent net (for last time step) and the unpacked output
        """
        x = self.embedding(x) # [max_len, batch_size, embedding_dim]
        
        ##### TODO:
        packed_x = nn.utils.rnn.pack_padded_sequence(x, lens)
        output, (hidden_state, c_n) = self.rnn(packed_x)
        unpacked_output = nn.utils.rnn.pad_packed_sequence(output)[0] 
        return unpacked_output, hidden_state
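If the packing/unpacking step is unclear, here is a minimal sketch with dummy tensors (illustrative only; shapes chosen arbitrarily):

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

dummy = torch.randn(5, 3, 8)               # [max_len, batch_size, embedding_dim]
lens = torch.tensor([5, 4, 2])             # lengths in descending order (enforce_sorted=True by default)
rnn = nn.LSTM(8, 16)                       # input_size=8, hidden_size=16, 1 layer, batch_first=False
packed = pack_padded_sequence(dummy, lens)
packed_out, (h_n, c_n) = rnn(packed)
unpacked, out_lens = pad_packed_sequence(packed_out)
print(unpacked.shape, h_n.shape)           # torch.Size([5, 3, 16]) torch.Size([1, 3, 16])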
Decoder Model (Implement This)
We will implement a decoder model that uses an attention mechanism, following https://arxiv.org/pdf/1409.0473.pdf. Please first review the links provided at the start of this assignment. The pseudo-code for your implementation should be roughly as follows:

1. The input is put through the encoder model, which gives us the encoder output of shape (max_length, batch_size, enc_units) and the encoder hidden state of shape (1, batch_size, enc_units).

2. Using the output of your encoder, you calculate the score and subsequently the attention weights using the following (Bahdanau-style additive) equations:

$$\mathrm{score}(h_t, \bar{h}_s) = v_a^{\top} \tanh\left(W_1 h_t + W_2 \bar{h}_s\right)$$

$$\alpha_{ts} = \frac{\exp\left(\mathrm{score}(h_t, \bar{h}_s)\right)}{\sum_{s'} \exp\left(\mathrm{score}(h_t, \bar{h}_{s'})\right)}, \qquad c_t = \sum_{s} \alpha_{ts}\, \bar{h}_s$$

where $h_t$ is the decoder hidden state, $\bar{h}_s$ is the encoder output at source position $s$, $\alpha_{ts}$ are the attention weights, and $c_t$ is the context vector.

3. Once you have calculated this attention (context) vector, you pass the original input x through an embedding layer. The output of this embedding layer is concatenated with the context vector and passed into your RNN.

4. Finally, you pass the output of your RNN into a fully connected layer whose output size equals the vocabulary size, giving a score for each possible next word.

NOTE: Initialize your embedding layer with the pretrained FastText embeddings computed above. Refer to nn.Embedding.from_pretrained in https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
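Before implementing the decoder, it may help to verify the attention shapes on random tensors (illustrative only; the layer names mirror those used in the class below):

import torch
import torch.nn as nn

batch_size, max_len, enc_units = 4, 7, 512
hidden = torch.randn(batch_size, 1, enc_units)           # decoder hidden state, already permuted
enc_output = torch.randn(batch_size, max_len, enc_units)

w1, w2, V = nn.Linear(enc_units, enc_units), nn.Linear(enc_units, enc_units), nn.Linear(enc_units, 1)
score = V(torch.tanh(w1(hidden) + w2(enc_output)))        # [batch_size, max_len, 1]
attention_weights = torch.softmax(score, dim=1)           # sums to 1 over the max_len dimension
context_vector = torch.sum(attention_weights * enc_output, dim=1)  # [batch_size, enc_units]
print(score.shape, attention_weights.shape, context_vector.shape)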

class Decoder(nn.Module):
    def __init__(self, pretrained_emb, vocab_size, embedding_dim, dec_units, enc_units, num_layers=1):
        super(Decoder, self).__init__()

        ##### TODO:
        # Convert pretrained_emb from np.array to torch.FloatTensor
        self.pretrained_emb = torch.from_numpy(pretrained_emb)

        # Initialize embedding layer with pretrained_emb
        self.embedding = nn.Embedding.from_pretrained(self.pretrained_emb)
        
        # Layers to compute score based on the formula you pick
        # We have tested the model using Bahdanau's additive style formula
        #   but feel free to try the Loung's style also and see if you can get better performance
        self.tanh = nn.Tanh()
        self.w1 = nn.Linear(enc_units, enc_units)
        self.w2 = nn.Linear(enc_units, enc_units)
        self.V = nn.Linear(enc_units, 1)
        self.softmax = nn.Softmax(dim=1)
        
        # Initialize a single-directional LSTM/GRU with 1 layer and batch_first=True
        # NOTE: input to your rnn would be a concatenation of two types of vectors
        self.rnn = nn.LSTM(embedding_dim+enc_units, dec_units, num_layers, batch_first=True)
        
        # Fully Connected Layer
        self.fc = nn.Linear(dec_units, vocab_size)
    
    def forward(self, x, hidden, enc_output):
        """
        Args:
            x: [batch_size, 1]
            hidden: [1, batch_size, enc_units]
            enc_output: [max_len, batch_size, enc_units]

        Returns:
            fc_out: [batch_size, vocab_size]
            hidden_state: [1, batch_size, dec_units]
            attention_weights: [batch_size, max_len, 1]

        Pseudo-code:
        - Calculate the score using the formula shown above using encoder output and hidden output. 
            Note h_t is the hidden output of the decoder and h_s is the encoder output in the formula
        - Calculate the attention weights using softmax and passing through V - which can be implemented as a fully connected layer
        - Finally find c_t which is a context vector where the shape of context_vector should be (batch_size, hidden_size)
        - You need to unsqueeze the context_vector for concatenating with x_embedding as listed in Point 3 above
        - Pass this concatenated tensor to the RNN and follow as specified in Point 4 above
        - Return (i)  output of your fc layer (takes output of your RNN as its input (might have to reshape it first))
        -        (ii) hidden states from the recurrent net (for last time step)
        -        (iii) attention weights
        """
        hidden = hidden.permute(1, 0, 2) # [batch_size, 1, enc_units]
        enc_output = enc_output.permute(1, 0, 2) # [batch_size, max_len, enc_units]

        ##### TODO:
        score = self.V(self.tanh(self.w1(hidden)+self.w2(enc_output))) # [batch_size, max_len, 1]
        attention_weights = self.softmax(score)      
        context_vector = torch.sum(attention_weights*enc_output, 1) # [batch_size, enc_units]
        c_t = torch.unsqueeze(context_vector, 1) # [batch_size, 1, enc_units]
        x_embedding = self.embedding(x)
        cx = torch.cat((c_t, x_embedding), 2)
        output, (hidden_state, _) = self.rnn(cx)
        output = torch.squeeze(output, dim=1) # [batch_size, dec_units]
        fc_out = self.fc(output)

        return fc_out, hidden_state, attention_weights
if __name__ == '__main__':
    vocab_inp_size = len(inp_lang.word2idx)
    vocab_tar_size = len(targ_lang.word2idx)

    encoder = Encoder(embedding_es, vocab_inp_size, EMBEDDING_DIM, UNITS).to(device)
    decoder = Decoder(embedding_eng, vocab_tar_size, EMBEDDING_DIM, UNITS, UNITS).to(device)

    model_params = list(encoder.parameters()) + list(decoder.parameters())
    optimizer = torch.optim.Adam(model_params, lr=LEARNING_RATE)

    print('Encoder and Decoder models initialized!')
Train and evaluate your model here
We expect your scores to exceed the following thresholds for full credit, for each of the 4 BLEU scores individually. No partial credit :(

BLEU-1 > 0.145
BLEU-2 > 0.030
BLEU-3 > 0.020
BLEU-4 > 0.015
if __name__ == '__main__':
    train_model(encoder, decoder, train_dataset, optimizer, targ_lang, device)
if __name__ == '__main__':
    save_candidate = evaluate_model(encoder, decoder, val_dataset, target_tensor_val, max_length_tar, targ_lang, device)
Save results to your Google Drive for Submission
You need to submit this results.pkl file to the autograder.

if __name__ == '__main__':
    from google.colab import drive
    drive.mount('/content/drive')

    pickle.dump(save_candidate, open('drive/My Drive/results.pkl', 'wb'))
    print('results.pkl saved to your Google Drive')
