This is a modification of https://github.com/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb using the TensorFlow 2.0 Keras implementation of BERT from kpe/bert-for-tf2 with the original google-research/bert weights.
# Predicting Movie Review Sentiment with kpe/bert-for-tf2
First, let’s import the standard libraries we’ll need:
import os
import math
import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf
tf.__version__
In addition to the standard libraries we imported above, we’ll need to install the bert-for-tf2 Python package and do the imports required for loading the pre-trained weights and tokenizing the input text.
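The bert-for-tf2 package is published on PyPI; if it isn’t already available in your environment, a notebook cell like the following should take care of it:

!pip install bert-for-tf2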
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer
# Data
First, let’s get the dataset, hosted by Stanford. The code below, which loads the IMDB Large Movie Review Dataset into pandas DataFrames, is borrowed from this TensorFlow tutorial; note that the actual download/extract call is commented out here, so it assumes the archive has already been extracted to ./aclImdb.
from tensorflow import keras
import os
import re
# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
    data = {}
    data["sentence"] = []
    data["sentiment"] = []
    for file_path in tqdm(os.listdir(directory), desc=os.path.basename(directory)):
        with tf.io.gfile.GFile(os.path.join(directory, file_path), "r") as f:
            data["sentence"].append(f.read())
            # The sentiment score is encoded in the file name, e.g. "123_8.txt".
            data["sentiment"].append(re.match(r"\d+_(\d+)\.txt", file_path).group(1))
    return pd.DataFrame.from_dict(data)
# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
    pos_df = load_directory_data(os.path.join(directory, "pos"))
    neg_df = load_directory_data(os.path.join(directory, "neg"))
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0
    return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)
# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
    # The tf.keras.utils.get_file call below would download and extract the
    # archive; here we assume ./aclImdb has already been extracted locally.
    # dataset = tf.keras.utils.get_file(
    #     fname="aclImdb.tar.gz",
    #     origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    #     extract=True)
    dataset = "./aclImdb"
    train_df = load_dataset(os.path.join(os.path.dirname(dataset),
                                         "aclImdb", "train"))
    test_df = load_dataset(os.path.join(os.path.dirname(dataset),
                                        "aclImdb", "test"))
    return train_df, test_df
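Assuming the extracted ./aclImdb directory sits next to the notebook, a quick sanity check of the two splits might look like this:

train_df, test_df = download_and_load_datasets()
print(train_df.shape, test_df.shape)  # each split holds 25,000 labelled reviews
train_df.head()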
Let’s use the MovieReviewData class below to prepare/encode the data for feeding into our BERT model, by:
- tokenizing the text
- trimming or padding it to `max_seq_len` length
- appending the special `[CLS]` and `[SEP]` tokens
- converting the string tokens to numerical IDs using the original model’s token encoding from `vocab.txt` (a quick illustration of these steps follows below)
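To make these steps concrete, here is a minimal sketch on a single sentence; the vocab.txt path is only an assumed example of where a downloaded uncased_L-12_H-768_A-12 checkpoint might live:

# Assumed checkpoint location; adjust to wherever the pre-trained weights were unpacked.
tokenizer = FullTokenizer(vocab_file=os.path.join(".model/uncased_L-12_H-768_A-12", "vocab.txt"))

tokens = tokenizer.tokenize("This movie was great!")   # e.g. ['this', 'movie', 'was', 'great', '!']
tokens = ["[CLS]"] + tokens + ["[SEP]"]                # add the special tokens
token_ids = tokenizer.convert_tokens_to_ids(tokens)    # map string tokens to vocabulary IDs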
from bert import bert_tokenization
class MovieReviewData:
    DATA_COLUMN = "sentence"
    LABEL_COLUMN = "polarity"

    def __init__(self, tokenizer: bert_tokenization.FullTokenizer, sample_size=None, max_seq_len=1024):
        self.tokenizer = tokenizer
        self.sample_size = sample_size
        self.max_seq_len = 0
        train, test = download_and_load_datasets()

        # Sort the reviews by length, so a small sample_size keeps the shortest ones.
        train, test = map(lambda df: df.reindex(df[MovieReviewData.DATA_COLUMN].str.len().sort_values().index),
                          [train, test])

        if sample_size is not None:
            assert sample_size % 128 == 0
            train, test = train.head(sample_size), test.head(sample_size)
            # train, test = map(lambda df: df.sample(sample_size), [train, test])

        ((self.train_x, self.train_y),
         (self.test_x, self.test_y)) = map(self._prepare, [train, test])

        print("max seq_len", self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, max_seq_len)
        ((self.train_x, self.train_x_token_types),
         (self.test_x, self.test_x_token_types)) = map(self._pad,
                                                       [self.train_x, self.test_x])

    def _prepare(self, df):
        # Tokenize every review, wrap it in [CLS]/[SEP], convert to token IDs,
        # and keep track of the longest sequence seen.
        x, y = [], []
        with tqdm(total=df.shape[0], unit_scale=True) as pbar:
            for ndx, row in df.iterrows():
                text, label = row[MovieReviewData.DATA_COLUMN], row[MovieReviewData.LABEL_COLUMN]
                tokens = self.tokenizer.tokenize(text)
                tokens = ["[CLS]"] + tokens + ["[SEP]"]
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                self.max_seq_len = max(self.max_seq_len, len(token_ids))
                x.append(token_ids)
                y.append(int(label))
                pbar.update()
        return np.array(x), np.array(y)

    def _pad(self, ids):
        # Trim/zero-pad every sequence to self.max_seq_len and build the
        # all-zero (single-segment) token type IDs expected by BERT.
        x, t = [], []
        token_type_ids = [0] * self.max_seq_len
        for input_ids in ids:
            input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
            t.append(token_type_ids)
        return np.array(x), np.array(t)
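Putting it together, a sketch of how the class might be instantiated, reusing the tokenizer constructed in the illustration above (the sample_size and max_seq_len values are just examples; sample_size must be a multiple of 128 because of the assert in __init__):

data = MovieReviewData(tokenizer, sample_size=10*128, max_seq_len=128)
print("train_x shape:", data.train_x.shape)
print("train_x_token_types shape:", data.train_x_token_types.shape)
print("max_seq_len:", data.max_seq_len)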