Dataset

Yelp reviews about restaurants.
1- and 2-star ratings are classified as “negative”
3- and 4-star ratings are classified as “positive”
560K training samples, 38K testing samples. We are using 10% here.

import numpy as np
import pandas as pd

# Directory holding the Yelp review CSV (relative to the current working directory).
data_root = "../../PyTorchNLPBook/data/yelp"

# Load the review file; the displayed frame has 'rating', 'review' and 'split' columns.
df = pd.read_csv(data_root + "/reviews_with_splits_lite.csv")
df

	rating	review	split
0	negative	terrible place to work for i just heard a stor...	train
1	negative	hours , minutes total time for an extremely s...	train
2	negative	my less than stellar review is for service . w...	train
3	negative	i m granting one star because there s no way t...	train
4	negative	the food here is mediocre at best . i went aft...	train
...	...	...	...
55995	positive	great food . wonderful , friendly service . i ...	test
55996	positive	charlotte should be the new standard for moder...	test
55997	positive	get the encore sandwich ! ! make sure to get i...	test
55998	positive	i m a pretty big ice cream gelato fan . pretty...	test
55999	positive	where else can you find all the parts and piec...	test

56000 rows × 3 columns

The dataset has been separated into ‘test’, ‘train’, and ‘val’ splits.

df.split.unique()

array(['train', 'val', 'test'], dtype=object)

The review column text has been preprocessed by:

- making the text lowercase
- adding spaces before and after punctuation marks
- replacing all other symbols with spaces

from torch.utils.data import Dataset

class Vocabulary(object):
    """Two-way mapping between tokens and integer indices.

    Args:
        token_to_idx (dict): optional existing token -> index mapping. It is
            copied, so the dict passed in is never modified.
        add_unk (bool): when True, `unk_token` is registered and its index is
            returned by `lookup_token` for tokens that are not in the mapping.
        unk_token (str): the token used to represent unknown tokens.
    """

    def __init__(self, token_to_idx=None, add_unk=True, unk_token='<UNK>'):
        # Copy rather than alias: add_token() mutates this dict, and sharing
        # it with the caller (or with another Vocabulary built from the same
        # serialized contents) would let one object corrupt the other.
        self._token_to_idx = {} if token_to_idx is None else dict(token_to_idx)

        # create the inverse mapping
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

        self._add_unk = add_unk
        self._unk_token = unk_token

        # -1 marks "no unknown token registered"
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def add_token(self, token):
        """ Add a token to both mappings and return its index.

        A token that is already present keeps its existing index.
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """ Get the index corresponding to a token.

        Returns `unk_index` for a missing token when the vocabulary was built
        with add_unk=True; otherwise a missing token raises KeyError.
        """
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """ Get the token corresponding to an index.

        Raises:
            KeyError: if the index is not in the Vocabulary.
        """
        if index not in self._idx_to_token:
            # %s (not %d) so a non-numeric index still produces a KeyError
            # instead of a TypeError from the string formatting itself.
            raise KeyError("the index (%s) is not in the Vocabulary" % (index,))
        return self._idx_to_token[index]

    def __len__(self):
        return len(self._token_to_idx)

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def to_serializable(self):
        """ Return a dict of constructor arguments that rebuilds this object. """
        return {
            # a copy, so later add_token() calls do not change the snapshot
            'token_to_idx': dict(self._token_to_idx),
            'add_unk': self._add_unk,
            'unk_token': self._unk_token
        }

    @classmethod
    def from_serializable(cls, contents):
        """ Build a Vocabulary from the dict produced by to_serializable(). """
        return cls(**contents)

import numpy as np
import string
from collections import Counter

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

class ReviewVectorizer(object):
    """Pairs a review-token vocabulary with a rating vocabulary.

    Note on filtering: `token not in string.punctuation` is a substring test
    against the punctuation string, so single punctuation characters and the
    empty token produced by consecutive spaces are both skipped.
    """

    def __init__(self, review_vocab, rating_vocab):
        self.rating_vocab = rating_vocab
        self.review_vocab = review_vocab

    def vectorize(self, review):
        """ Encode a review as a float32 vector with one slot per vocabulary
        entry; a slot is 1 if that token occurs in the review, else 0. """
        vocab = self.review_vocab
        encoded = np.zeros(len(vocab), dtype=np.float32)
        kept_tokens = (tok for tok in review.split(" ")
                       if tok not in string.punctuation)
        for tok in kept_tokens:
            encoded[vocab.lookup_token(tok)] = 1
        return encoded

    @classmethod
    def from_dataframe(cls, review_df, cutoff=25):
        """ Build a vectorizer from a frame with `rating` and `review` columns.

        Ratings are added in sorted order. A word enters the review
        vocabulary only if it occurs strictly more than `cutoff` times.
        """
        rating_vocab = Vocabulary(add_unk=False)
        for label in sorted(set(review_df.rating)):
            rating_vocab.add_token(label)

        # Count every non-punctuation token over all reviews.
        frequencies = Counter(
            word
            for text in review_df.review
            for word in text.split(" ")
            if word not in string.punctuation
        )

        review_vocab = Vocabulary(add_unk=True)
        for word, n_seen in frequencies.items():
            if n_seen > cutoff:
                review_vocab.add_token(word)

        return cls(review_vocab, rating_vocab)

    def to_serializable(self):
        """ Return the serializable form of both vocabularies. """
        serialized = {}
        serialized['review_vocab'] = self.review_vocab.to_serializable()
        serialized['rating_vocab'] = self.rating_vocab.to_serializable()
        return serialized

    @classmethod
    def from_serializable(cls, contents):
        """ Rebuild a vectorizer from the dict made by to_serializable(). """
        return cls(
            Vocabulary.from_serializable(contents['review_vocab']),
            Vocabulary.from_serializable(contents['rating_vocab']),
        )

class ReviewDataset(Dataset):
    """Dataset over a review frame that serves one split at a time.

    Args:
        review_df (pandas.DataFrame): frame with `review`, `rating` and
            `split` columns; `split` holds 'train', 'val' or 'test'.
        vectorizer: object providing `vectorize(review)` and a
            `rating_vocab` with `lookup_token(rating)`.
    """

    def __init__(self, review_df, vectorizer):
        self.review_df = review_df
        self._vectorizer = vectorizer

        self.train_df = self.review_df[self.review_df.split == 'train']
        self.val_df = self.review_df[self.review_df.split == 'val']
        self.test_df = self.review_df[self.review_df.split == 'test']

        self.train_size = len(self.train_df)
        self.val_size = len(self.val_df)
        self.test_size = len(self.test_df)

        # split name -> (rows of that split, number of rows)
        self._lookup_dict = {
            'train': (self.train_df, self.train_size),
            'val': (self.val_df, self.val_size),
            'test': (self.test_df, self.test_size)
        }

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv, cutoff=25):
        """Load dataset and make a new vectorizer from scratch.

        The vectorizer is fitted on the 'train' rows only, so the vocabulary
        never sees validation or test reviews.

        Args:
            review_csv (str): location of the dataset
            cutoff (int): frequency threshold passed to
                ReviewVectorizer.from_dataframe
        Returns:
            an instance of ReviewDataset
        """
        review_df = pd.read_csv(review_csv)
        train_review_df = review_df[review_df.split == 'train']
        vectorizer = ReviewVectorizer.from_dataframe(train_review_df,
                                                     cutoff=cutoff)
        return cls(review_df, vectorizer)

    def get_vectorizer(self):
        """ Return the vectorizer this dataset was built with. """
        return self._vectorizer

    def set_split(self, split='train'):
        """ Select which split ('train', 'val' or 'test') indexing and len() use.

        Raises:
            KeyError: if `split` is not one of the three known names.
        """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """ Return {'x_data': vectorized review, 'y_target': rating index}
        for the row at position `index` of the active split. """
        row = self._target_df.iloc[index]
        review_vector = \
            self._vectorizer.vectorize(row.review)
        rating_index = \
            self._vectorizer.rating_vocab.lookup_token(row.rating)
        return {'x_data': review_vector,
                'y_target': rating_index}

    def get_num_batches(self, batch_size):
        """ Number of full batches of `batch_size` in the active split. """
        return len(self) // batch_size

from torch.utils.data import DataLoader

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last = True, device="cpu"):
    """
    Yield minibatches from `dataset` through a PyTorch DataLoader.

    Each yielded item is a new dict with the same keys as the DataLoader's
    batch dict, where every value has been moved to `device` via `.to()`.
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {field: values.to(device) for field, values in batch.items()}

import torch.nn as nn
import torch.nn.functional as F

class ReviewClassifier(nn.Module):
    """Single linear layer that maps a feature vector to one logit."""

    def __init__(self, num_features):
        """
        Args:
            num_features (int): the size of the input feature vector
        """
        super().__init__()
        self.fcl = nn.Linear(in_features=num_features,
                            out_features=1)

    def forward(self, x_in, apply_sigmoid=False):
        """Run the classifier.

        Args:
            x_in (torch.Tensor): input of shape (batch, num_features)
            apply_sigmoid (bool): when True the sigmoid is applied to the
                output; leave False when the loss works on raw logits
                (e.g. nn.BCEWithLogitsLoss)
        Returns:
            torch.Tensor of shape (batch,)
        """
        # Drop only the trailing size-1 feature axis. A bare squeeze() would
        # also drop the batch axis for a batch of one and return a 0-d
        # tensor that no longer lines up with a (1,)-shaped target.
        y_out = self.fcl(x_in).squeeze(-1)
        if apply_sigmoid:
            y_out = F.sigmoid(y_out)
        return y_out

from argparse import Namespace

# Run configuration gathered in one namespace so cells can read args.<name>.
args = Namespace(
    # Data and path information
    frequency_cutoff = 25,
    model_state_file = 'model.pth',
    review_csv = data_root + "/reviews_with_splits_lite.csv",
    save_dir = ".",
    # No model hyperparams
    # Training hyperparams
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=15,
    seed=1337,
    # requested device; reset to False below when CUDA is unavailable
    cuda=True
)

import torch.optim as optim

def make_train_state(args):
    """Return a fresh dict for tracking training progress.

    The per-epoch metric histories start as empty lists and the test metrics
    start at the placeholder -1. `args` is accepted but not read.
    """
    state = {'epoch_index': 0}
    for history_key in ('train_loss', 'train_acc', 'val_loss', 'val_acc'):
        state[history_key] = []
    state['test_loss'] = -1
    state['test_acc'] = -1
    return state

train_state = make_train_state(args)

import torch

# Fall back to the CPU when no CUDA device is available.
if not torch.cuda.is_available():
    args.cuda = False

args.cuda

True

# Device the classifier and the generated batches are moved to.
args.device = torch.device("cuda" if args.cuda else "cpu")

args.device

device(type='cuda')

# Build the dataset (and its vectorizer) from the CSV.
dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.review_csv)
vectorizer = dataset.get_vectorizer()

# One input feature per entry of the review vocabulary.
classifier = ReviewClassifier(num_features=len(vectorizer.review_vocab))
classifier = classifier.to(args.device)

# BCEWithLogitsLoss takes raw logits, so the classifier is called without
# apply_sigmoid during training and evaluation.
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr = args.learning_rate)

def compute_accuracy(pred, target):
    """Return the percentage of examples whose predicted class equals the label.

    Args:
        pred: raw classifier outputs (logits), one per example, as a tensor
            or array-like. A logit above 0 (i.e. sigmoid above 0.5) counts
            as class 1, anything else as class 0.
        target: the 0/1 labels, same number of elements as `pred`.
    Returns:
        float: accuracy in the range [0, 100]; 0.0 for empty input.
    """
    # detach() so the result never keeps an autograd graph alive
    logits = torch.as_tensor(pred).detach().reshape(-1)
    labels = torch.as_tensor(target).detach().reshape(-1).to(logits.device)
    if logits.numel() == 0:
        return 0.0
    # sigmoid(x) > 0.5 is equivalent to x > 0, so threshold the logits directly
    predicted = (logits > 0).long()
    n_correct = (predicted == labels.long()).sum().item()
    return 100.0 * n_correct / logits.numel()

compute_accuracy(np.zeros(10), np.ones(10))

10.0

# A bare-bones training loop: each epoch makes one pass over the 'train'
# split (with parameter updates) and one pass over the 'val' split (without).
for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index

    # Iterate over training dataset
    dataset.set_split('train')
    batch_generator = generate_batches(dataset,
                                    batch_size=args.batch_size,
                                    device=args.device)

    running_loss = 0.0
    running_acc = 0.0
    classifier.train()

    for batch_index, batch_dict in enumerate(batch_generator):
        # zero the gradients
        optimizer.zero_grad()

        # compute the output
        y_pred = classifier(x_in=batch_dict['x_data'].float())

        # compute the loss; the running value is an incremental mean
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)

        # compute gradients
        loss.backward()

        # update params
        optimizer.step()

        # compute accuracy on detached outputs and store it as a plain float,
        # so the running mean does not hold on to every batch's autograd graph
        acc_batch = float(compute_accuracy(y_pred.detach(),
                                           batch_dict['y_target']))
        running_acc += (acc_batch - running_acc) / (batch_index + 1)


    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)

    # Iterate over validation dataset
    dataset.set_split('val')
    batch_generator = generate_batches(dataset, batch_size=args.batch_size,
                                        device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.eval()

    # no gradients are needed for evaluation
    with torch.no_grad():
        for batch_index, batch_dict in enumerate(batch_generator):
            # compute the output
            y_pred = classifier(x_in=batch_dict['x_data'].float())

            # compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'].float())
            loss_batch = loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_batch = float(compute_accuracy(y_pred,
                                               batch_dict['y_target']))
            running_acc += (acc_batch - running_acc) / (batch_index + 1)


    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)

KeyboardInterrupt:

train_state

{'epoch_index': 22,
 'train_loss': [0.48078572292343463,
  0.32938406645667323,
  0.27444523850492386,
  0.24338128739127932,
  0.22270150414479334,
  0.20760727328023101,
  0.19577689098766424,
  0.18615816270603858,
  0.17823111704167202,
  0.17131327121865514,
  0.1652598268309839,
  0.16008958478573887,
  0.1552568533354335,
  0.15096716351466236,
  0.14698867341564376,
  0.1434966386874128,
  0.14009482598577452,
  0.13711761412959464,
  0.13425543120289157,
  0.13144471408689717,
  0.12905195598898375,
  0.1264154611800622],
 'train_acc': [tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>)],
 'val_loss': [0.3827091707633092,
  0.3089083020503706,
  0.2740706067818864,
  0.25343966277746044,
  0.2393623280983705,
  0.22958406485044042,
  0.2229269543519387,
  0.21800954869160286,
  0.2149605845029538,
  0.2100421364490802,
  0.20944883227348335,
  0.20681792107912217,
  0.2058340288125552,
  0.2045778891214958,
  0.20414738998963283,
  0.2039427431730124,
  0.2039891733573033,
  0.20289532026419277,
  0.20320990589948798,
  0.20385687832648938,
  0.2045557994108934,
  0.20566382247668039],
 'val_acc': [tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>)],
 'test_loss': -1,
 'test_acc': -1}

batch_dict['x_data'].shape

torch.Size([128, 8945])

import gc

len(dataset._vectorizer.review_vocab)

classifier

ReviewClassifier(
  (fcl): Linear(in_features=8945, out_features=1, bias=True)
)

# Evaluate the classifier on the 'test' split and record loss and accuracy.
dataset.set_split('test')
batch_generator = generate_batches(dataset, batch_size=args.batch_size,
                                   device=args.device)

running_loss = 0.0
running_acc = 0.0

classifier.eval()

# Evaluation needs no gradients; disabling them avoids building an autograd
# graph per batch, which makes a per-batch gc.collect() unnecessary.
with torch.no_grad():
    for batch_index, batch_dict in enumerate(batch_generator):
        y_pred = classifier(x_in = batch_dict['x_data'].float())
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        # incremental mean over the batches seen so far
        running_loss += (loss_batch - running_loss) / (batch_index + 1)

        acc_batch = float(compute_accuracy(y_pred, batch_dict['y_target']))
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

train_state['test_loss']

0.21400380661854376

import re

def preprocess_text(text):
    """Lowercase `text`, pad . , ! ? with spaces, and collapse every run of
    other non-letter characters into a single space."""
    substitutions = (
        (r"([.,!?])", r" \1 "),
        (r"[^a-zA-Z.,!?]+", r" "),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def predict_rating(review, classifier, vectorizer, decision_threshold = 0.5):
    """Predict the rating label of a raw review string.

    Args:
        review (str): the review text, not yet preprocessed
        classifier (torch.nn.Module): the trained model
        vectorizer: object providing `vectorize(review)` and a `rating_vocab`
            with `lookup_index(index)`
        decision_threshold (float): probabilities below this value map to
            rating index 0, all others to rating index 1
    Returns:
        the rating token stored at the chosen index of `rating_vocab`
    """
    review = preprocess_text(review)

    # Run on whatever device the model lives on instead of assuming CUDA,
    # so the function also works on CPU-only machines.
    first_param = next(classifier.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    vectorized_review = torch.tensor(vectorizer.vectorize(review)).to(device)

    # inference only: no autograd graph needed
    with torch.no_grad():
        result = classifier(vectorized_review.view(1, -1))
    probability_value = torch.sigmoid(result).item()

    index = 0 if probability_value < decision_threshold else 1

    return vectorizer.rating_vocab.lookup_index(index)

test_review = "this is a pretty awesome book"
prediction = predict_rating(test_review, classifier, vectorizer)
prediction

'positive'

# The linear layer has a single output unit, so row 0 of its weight matrix
# holds one weight per review-vocabulary entry.
fcl_weights = classifier.fcl.weight.detach()[0]

# Vocabulary indices ordered from the largest weight to the smallest.
_, indices = torch.sort(fcl_weights, dim=0, descending = True)
indices = indices.cpu().numpy().tolist()

# Print the tokens behind the 20 largest weights.
for i in range(20):
    print(vectorizer.review_vocab.lookup_index(indices[i]))

delicious
fantastic
pleasantly
amazing
vegas
great
yum
excellent
ngreat
awesome
yummy
perfect
love
bomb
chinatown
deliciousness
solid
notch
hooked
nthank

# Reverse in place so the smallest weights come first, then print the
# tokens behind the 20 smallest weights.
indices.reverse()
for i in range(20):
    print(vectorizer.review_vocab.lookup_index(indices[i]))

worst
mediocre
bland
horrible
meh
awful
terrible
rude
tasteless
overpriced
disgusting
slowest
unacceptable
poorly
nmaybe
unfriendly
downhill
disappointing
disappointment
underwhelmed