import numpy as np
import pandas as pd
Dataset
- Yelp reviews about restaurants.
- 1- and 2-star ratings are classified as “negative”
- 3- and 4-star ratings are “positive”
- 560K training samples, 38K testing samples. We are using 10% here.
= "../../PyTorchNLPBook/data/yelp" data_root
= pd.read_csv(data_root + "/reviews_with_splits_lite.csv")
df df
  | rating | review | split
---|---|---|---
0 | negative | terrible place to work for i just heard a stor... | train |
1 | negative | hours , minutes total time for an extremely s... | train |
2 | negative | my less than stellar review is for service . w... | train |
3 | negative | i m granting one star because there s no way t... | train |
4 | negative | the food here is mediocre at best . i went aft... | train |
... | ... | ... | ... |
55995 | positive | great food . wonderful , friendly service . i ... | test |
55996 | positive | charlotte should be the new standard for moder... | test |
55997 | positive | get the encore sandwich ! ! make sure to get i... | test |
55998 | positive | i m a pretty big ice cream gelato fan . pretty... | test |
55999 | positive | where else can you find all the parts and piec... | test |
56000 rows × 3 columns
The dataset has been separated into ‘test’, ‘train’, and ‘val’ splits.
df.split.unique()
array(['train', 'val', 'test'], dtype=object)
The review column text has been preprocessed to:
- make the text lowercase
- add spaces before and after punctuation
- replace all other symbols with spaces
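The same cleanup is implemented later in this notebook as preprocess_text; as a rough sketch, it boils down to lowercasing plus two regular expressions (the example input below is made up):

import re

text = "Great food!! 10/10"
text = text.lower()
text = re.sub(r"([.,!?])", r" \1 ", text)      # pad punctuation with spaces
text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)   # replace everything else with a space
print(text)                                     # roughly: "great food ! ! "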
from torch.utils.data import Dataset
class Vocabulary(object):
def __init__(self, token_to_idx=None, add_unk=True, unk_token='<UNK>'):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
# create the inverse mapping
self._idx_to_token = { idx : token for token, idx in \
self._token_to_idx.items() }
self._add_unk = add_unk
self._unk_token = unk_token
self.unk_index = -1
if add_unk:
self.unk_index = self.add_token(unk_token)
def add_token(self, token):
""" Add a token to mappings and return it's index. """
if token in self._token_to_idx:
= self._token_to_idx[token]
index else:
= len(self._token_to_idx)
index self._token_to_idx[token] = index
self._idx_to_token[index] = token
return index
def lookup_token(self, token):
""" Get the index corresponding to a token """
if self._add_unk:
return self._token_to_idx.get(token, self.unk_index)
else:
return self._token_to_idx[token]
def lookup_index(self, index):
""" Get the token corresponding to an index. """
if index not in self._idx_to_token:
raise KeyError("the index (%d) is not in the Vocabulary" %index)
return self._idx_to_token[index]
def __len__(self):
return len(self._token_to_idx)
def __str__(self):
return "<Vocabulary(size=%d)>" %len(self)
def to_serializable(self):
return {
'token_to_idx': self._token_to_idx,
'add_unk': self._add_unk,
'unk_token': self._unk_token
}
@classmethod
def from_serializable(cls, contents):
return cls(**contents)
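A quick illustration of how the vocabulary behaves (the example tokens here are made up, not from the dataset):

v = Vocabulary()                      # add_unk=True, so '<UNK>' takes index 0
print(v.add_token("good"))            # 1
print(v.add_token("bad"))             # 2
print(v.lookup_token("good"))         # 1
print(v.lookup_token("never-seen"))   # 0, falls back to the <UNK> index
print(v.lookup_index(2))              # 'bad'
print(v)                              # <Vocabulary(size=3)>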
import numpy as np
import string
from collections import Counter
string.punctuation
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
class ReviewVectorizer(object):
def __init__(self, review_vocab, rating_vocab):
self.review_vocab = review_vocab
self.rating_vocab = rating_vocab
def vectorize(self, review):
""" Returns a collapsed one-hot vector for a given review. """
        one_hot = np.zeros(len(self.review_vocab), dtype=np.float32)
        for token in review.split(" "):
            if token not in string.punctuation:
                index = self.review_vocab.lookup_token(token)
                one_hot[index] = 1
        return one_hot
@classmethod
def from_dataframe(cls, review_df, cutoff=25):
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)

        # Add ratings
        for rating in sorted(set(review_df.rating)):
            rating_vocab.add_token(rating)

        # Count word frequencies, ignoring punctuation tokens
        word_counts = Counter()
        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
for word, count in word_counts.items():
if count > cutoff:
review_vocab.add_token(word)
return cls(review_vocab, rating_vocab)
def to_serializable(self):
return {
'review_vocab': self.review_vocab.to_serializable(),
'rating_vocab': self.rating_vocab.to_serializable()
}
@classmethod
def from_serializable(cls, contents):
        review_vocab = Vocabulary.from_serializable(contents['review_vocab'])
        rating_vocab = Vocabulary.from_serializable(contents['rating_vocab'])
        return cls(review_vocab, rating_vocab)
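To make the "collapsed one-hot" idea concrete, here is a tiny made-up example (toy_df and its contents are hypothetical, not part of the Yelp data):

toy_df = pd.DataFrame({
    'rating': ['positive', 'negative'],
    'review': ['good food good service', 'bad food'],
    'split':  ['train', 'train'],
})
toy_vec = ReviewVectorizer.from_dataframe(toy_df, cutoff=0)
x = toy_vec.vectorize('good food')
print(len(toy_vec.review_vocab))   # 5: <UNK> + good, food, service, bad
print(x.sum())                     # 2.0 -- one slot each for 'good' and 'food'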
class ReviewDataset(Dataset):
def __init__(self, review_df, vectorizer):
self.review_df = review_df
self._vectorizer = vectorizer
self.train_df = self.review_df[self.review_df.split == 'train']
self.val_df = self.review_df[self.review_df.split == 'val']
self.test_df = self.review_df[self.review_df.split == 'test']
self.train_size = len(self.train_df)
self.val_size = len(self.val_df)
self.test_size = len(self.test_df)
self._lookup_dict = {
'train': (self.train_df, self.train_size),
'val': (self.val_df, self.val_size),
'test': (self.test_df, self.test_size)
}
self.set_split('train')
@classmethod
def load_dataset_and_make_vectorizer(cls, review_csv):
"""Load dataset and make a new vectorizer from scratch.
Args:
review_csv (str): location of the dataset
Returns:
an instance of ReviewDataset
"""
        review_df = pd.read_csv(review_csv)
        return cls(review_df, ReviewVectorizer.from_dataframe(review_df))
def get_vectorizer(self):
return self._vectorizer
def set_split(self, split='train'):
self._target_split = split
self._target_df, self._target_size = self._lookup_dict[split]
def __len__(self):
return self._target_size
def __getitem__(self, index):
        row = self._target_df.iloc[index]
        review_vector = self._vectorizer.vectorize(row.review)
        rating_index = self._vectorizer.rating_vocab.lookup_token(row.rating)
return {'x_data': review_vector,
'y_target': rating_index}
def get_num_batches(self, batch_size):
return len(self) // batch_size
from torch.utils.data import DataLoader
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    A minibatch generator function that wraps the PyTorch DataLoader
    and ensures each tensor is on the right device.
    """
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=shuffle,
                            drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = tensor.to(device)
        yield out_data_dict
import torch.nn as nn
import torch.nn.functional as F
class ReviewClassifier(nn.Module):
def __init__(self, num_features):
"""
Args:
num_features (int): the size of the input feature vector
"""
super(ReviewClassifier, self).__init__()
        self.fcl = nn.Linear(in_features=num_features,
                             out_features=1)
def forward(self, x_in, apply_sigmoid=False):
        y_out = self.fcl(x_in).squeeze()
        if apply_sigmoid:
            y_out = F.sigmoid(y_out)
        return y_out
from argparse import Namespace
args = Namespace(
    # Data and path information
    frequency_cutoff=25,
    model_state_file='model.pth',
    review_csv=data_root + "/reviews_with_splits_lite.csv",
    save_dir=".",
    # No model hyperparams
    # Training hyperparams
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=15,
    seed=1337,
    cuda=True
)
import torch.optim as optim
def make_train_state(args):
return {
'epoch_index': 0,
'train_loss': [],
'train_acc': [],
'val_loss': [],
'val_acc': [],
'test_loss': -1,
'test_acc': -1,
}
train_state = make_train_state(args)
import torch
if not torch.cuda.is_available():
    args.cuda = False
args.cuda
True
args.device = torch.device("cuda" if args.cuda else "cpu")
args.device
device(type='cuda')
dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.review_csv)
vectorizer = dataset.get_vectorizer()
classifier = ReviewClassifier(num_features=len(vectorizer.review_vocab))
classifier = classifier.to(args.device)
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
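Note that the classifier is left to output raw logits during training (apply_sigmoid=False): nn.BCEWithLogitsLoss combines the sigmoid and the binary cross-entropy in a single, numerically more stable step, so the sigmoid must not be applied twice.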
def compute_accuracy(pred, target):
return ((target - pred) / target).sum()
compute_accuracy(np.zeros(10), np.ones(10))
10.0
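As written, compute_accuracy is not a conventional accuracy metric: dividing by targets of 0 produces inf/NaN, which is why the train_acc and val_acc entries recorded further below are NaN tensors. A more conventional binary accuracy for this setup could look like the following sketch (my own formulation, not part of the original run):

def compute_binary_accuracy(y_pred, y_target):
    """Percentage of predictions matching the targets.
    Assumes y_pred holds raw logits (apply_sigmoid=False) and
    y_target holds 0/1 labels.
    """
    y_pred_labels = (torch.sigmoid(y_pred) > 0.5).long()
    n_correct = torch.eq(y_pred_labels, y_target.long()).sum().item()
    return n_correct / len(y_pred_labels) * 100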
# A bare-bones training loop
for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index

    # Iterate over training dataset
    dataset.set_split('train')
    batch_generator = generate_batches(dataset,
                                       batch_size=args.batch_size,
                                       device=args.device)

    running_loss = 0.0
    running_acc = 0.0
    classifier.train()

    for batch_index, batch_dict in enumerate(batch_generator):
        # zero the gradients
        optimizer.zero_grad()

        # compute the output
        y_pred = classifier(x_in=batch_dict['x_data'].float())

        # compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)

        # compute gradients
        loss.backward()
        # update params
        optimizer.step()

        # compute accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)

    # Iterate over validation dataset
    dataset.set_split('val')
    batch_generator = generate_batches(dataset, batch_size=args.batch_size,
                                       device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.eval()

    for batch_index, batch_dict in enumerate(batch_generator):
        # compute the output
        y_pred = classifier(x_in=batch_dict['x_data'].float())

        # compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)

        # compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
(The loop was stopped manually with a KeyboardInterrupt before it ran to completion.)
train_state
{'epoch_index': 22,
'train_loss': [0.48078572292343463,
0.32938406645667323,
0.27444523850492386,
0.24338128739127932,
0.22270150414479334,
0.20760727328023101,
0.19577689098766424,
0.18615816270603858,
0.17823111704167202,
0.17131327121865514,
0.1652598268309839,
0.16008958478573887,
0.1552568533354335,
0.15096716351466236,
0.14698867341564376,
0.1434966386874128,
0.14009482598577452,
0.13711761412959464,
0.13425543120289157,
0.13144471408689717,
0.12905195598898375,
0.1264154611800622],
'train_acc': [tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>)],
'val_loss': [0.3827091707633092,
0.3089083020503706,
0.2740706067818864,
0.25343966277746044,
0.2393623280983705,
0.22958406485044042,
0.2229269543519387,
0.21800954869160286,
0.2149605845029538,
0.2100421364490802,
0.20944883227348335,
0.20681792107912217,
0.2058340288125552,
0.2045778891214958,
0.20414738998963283,
0.2039427431730124,
0.2039891733573033,
0.20289532026419277,
0.20320990589948798,
0.20385687832648938,
0.2045557994108934,
0.20566382247668039],
'val_acc': [tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>),
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>)],
'test_loss': -1,
'test_acc': -1}
batch_dict['x_data'].shape
torch.Size([128, 8945])
import gc
len(dataset._vectorizer.review_vocab)
8945
classifier
ReviewClassifier(
(fcl): Linear(in_features=8945, out_features=1, bias=True)
)
dataset.set_split('test')
batch_generator = generate_batches(dataset, batch_size=args.batch_size,
                                   device=args.device)

running_loss = 0.0
running_acc = 0.0

classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    gc.collect()
    y_pred = classifier(x_in=batch_dict['x_data'].float())
    loss = loss_func(y_pred, batch_dict['y_target'].float())
    loss_batch = loss.item()
    running_loss += (loss_batch - running_loss) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_loss']
0.21400380661854376
import re
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text
def predict_rating(review, classifier, vectorizer, decision_threshold=0.5):
    review = preprocess_text(review)
    vectorized_review = torch.tensor(vectorizer.vectorize(review)).cuda()
    result = classifier(vectorized_review.view(1, -1))
    probability_value = torch.sigmoid(result).item()

    index = 1
    if probability_value < decision_threshold:
        index = 0
return vectorizer.rating_vocab.lookup_index(index)
= "this is a pretty awesome book"
test_review = predict_rating(test_review, classifier, vectorizer)
prediction prediction
'positive'
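Because the classifier is a single linear layer over collapsed one-hot word vectors, each weight in classifier.fcl corresponds to exactly one vocabulary word; sorting the weights therefore surfaces the words that push a review most strongly toward “positive” (largest weights) or “negative” (most negative weights).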
fcl_weights = classifier.fcl.weight.detach()[0]
_, indices = torch.sort(fcl_weights, dim=0, descending=True)
indices = indices.cpu().numpy().tolist()

for i in range(20):
    print(vectorizer.review_vocab.lookup_index(indices[i]))
delicious
fantastic
pleasantly
amazing
vegas
great
yum
excellent
ngreat
awesome
yummy
perfect
love
bomb
chinatown
deliciousness
solid
notch
hooked
nthank
indices.reverse()

for i in range(20):
    print(vectorizer.review_vocab.lookup_index(indices[i]))
worst
mediocre
bland
horrible
meh
awful
terrible
rude
tasteless
overpriced
disgusting
slowest
unacceptable
poorly
nmaybe
unfriendly
downhill
disappointing
disappointment
underwhelmed