Data Description
The dataset is the IMDB movie-review corpus bundled with Keras: 50,000 reviews already encoded as sequences of integer word indices, each labelled 0 (negative) or 1 (positive).
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')
# Setting the current working directory
import os; os.chdir('drive/My Drive/Great Learning/NLP')
# Import packages
import pandas as pd, numpy as np
import tensorflow as tf
assert tf.__version__ >= '2.0'
from itertools import islice
# Keras
# Keras (use the tf.keras API bundled with TensorFlow 2.x)
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, MaxPooling1D, Conv1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.datasets import imdb
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Suppress warnings
import warnings; warnings.filterwarnings('ignore')
random_state = 42
np.random.seed(random_state)
tf.random.set_seed(random_state)
vocab_size = 10000
maxlen = 300
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words = vocab_size)
x_train = pad_sequences(x_train, maxlen = maxlen, padding = 'pre')
x_test = pad_sequences(x_test, maxlen = maxlen, padding = 'pre')
X = np.concatenate((x_train, x_test), axis = 0)
y = np.concatenate((y_train, y_test), axis = 0)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state, shuffle = True)
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size = 0.2, random_state = random_state, shuffle = True)
print('---'*20, f'\nNumber of rows in training dataset: {x_train.shape[0]}')
print(f'Number of columns in training dataset: {x_train.shape[1]}')
print(f'Number of unique words in training dataset: {len(np.unique(np.hstack(x_train)))}')
print('---'*20, f'\nNumber of rows in validation dataset: {x_valid.shape[0]}')
print(f'Number of columns in validation dataset: {x_valid.shape[1]}')
print(f'Number of unique words in validation dataset: {len(np.unique(np.hstack(x_valid)))}')
print('---'*20, f'\nNumber of rows in test dataset: {x_test.shape[0]}')
print(f'Number of columns in test dataset: {x_test.shape[1]}')
print(f'Number of unique words in test dataset: {len(np.unique(np.hstack(x_test)))}')
print('---'*20, f'\nUnique Categories: {np.unique(y_train), np.unique(y_valid), np.unique(y_test)}')
def decode_review(x, y):
    # Map a sequence of word ids back to text and print it with its label
    w2i = imdb.get_word_index()
    # Shift all indices by 3 because ids 0-2 are reserved for special tokens
    w2i = {k: (v + 3) for k, v in w2i.items()}
    w2i['<PAD>'] = 0
    w2i['<START>'] = 1
    w2i['<UNK>'] = 2
    i2w = {i: w for w, i in w2i.items()}
    ws = ' '.join(i2w[i] for i in x)
    print(f'Review: {ws}')
    print(f'Actual Sentiment: {y}')
    return w2i, i2w
w2i, i2w = decode_review(x_train[0], y_train[0])
# Get the first 50 (id, word) pairs from the id-to-word dictionary
print('---'*30, '\n', list(islice(i2w.items(), 0, 50)))
We can think of the Embedding layer as a dictionary that maps the integer index assigned to a word to a dense word vector. The layer is flexible: it can be trained from scratch as part of the model or initialised with pre-trained word vectors.
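As a minimal sketch of that lookup behaviour (a standalone toy example, separate from the model below; the vocabulary size and vector dimension are made up for illustration):
import numpy as np
import tensorflow as tf
# A toy Embedding layer: 10 possible word ids, each mapped to a 4-dimensional vector
demo_embedding = tf.keras.layers.Embedding(input_dim = 10, output_dim = 4)
ids = np.array([[1, 2, 5]])   # one "sentence" of three word ids
vectors = demo_embedding(ids)
print(vectors.shape)          # (1, 3, 4): one sentence, three words, four dimensions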
The Keras Embedding layer doesn't require us to one-hot encode our words; instead, each word must be given a unique integer id. For the IMDB dataset we've loaded this has already been done, but if it hadn't been, we could use scikit-learn's LabelEncoder, as sketched below.
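A minimal sketch of that encoding step (the tokens here are hypothetical, purely for illustration):
from sklearn.preprocessing import LabelEncoder
# A toy corpus of raw tokens; in practice these would come from tokenised reviews
tokens = ['the', 'movie', 'was', 'great', 'the', 'plot', 'was', 'thin']
le = LabelEncoder()
ids = le.fit_transform(tokens)
print(dict(zip(le.classes_, le.transform(le.classes_))))  # word -> integer id
print(ids)  # the token sequence as integer ids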
# Model
model = Sequential()
model.add(Embedding(vocab_size, 256, input_length = maxlen))
model.add(Dropout(0.25))
model.add(Conv1D(256, 5, padding = 'same', activation = 'relu', strides = 1))
model.add(Conv1D(128, 5, padding = 'same', activation = 'relu', strides = 1))
model.add(MaxPooling1D(pool_size = 2))
model.add(Conv1D(64, 5, padding = 'same', activation = 'relu', strides = 1))
model.add(MaxPooling1D(pool_size = 2))
model.add(LSTM(75))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
print(model.summary())
# Adding callbacks
es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 0)
mc = ModelCheckpoint('imdb_model.h5', monitor = 'val_loss', mode = 'min', save_best_only = True, verbose = 1)
# Fit the model
model.fit(x_train, y_train, validation_data = (x_valid, y_valid), epochs = 3, batch_size = 64, verbose = True, callbacks = [es, mc])
# Evaluate the model
scores = model.evaluate(x_test, y_test, batch_size = 64)
print('Test accuracy: %.2f%%' % (scores[1]*100))
# predict_classes was removed from recent Keras; threshold the sigmoid output instead
y_pred = (model.predict(x_test) > 0.5).astype('int32')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')
# Inspect the intermediate output of every layer for one random test review
sample_x_test = x_test[np.random.randint(10000)]
for layer in model.layers:
    model_layer = Model(inputs = model.input, outputs = model.get_layer(layer.name).output)
    output = model_layer.predict(sample_x_test.reshape(1, -1))
    print('\n', '--'*20, layer.name, 'layer', '--'*20, '\n')
    print(output)
# Compare the actual and predicted sentiment for one test review
decode_review(x_test[10], y_test[10])
print(f'Predicted sentiment: {y_pred[10][0]}')