## @package char_rnn
# Module caffe2.python.examples.char_rnn
from caffe2.python import core, workspace, model_helper, utils, brew
from caffe2.python.rnn_cell import LSTM
from caffe2.proto import caffe2_pb2
from caffe2.python.optimizer import build_sgd
import argparse
import logging
import numpy as np
from datetime import datetime
'''
This script takes a text file as input and uses a recurrent neural network
to learn to predict the next character in a sequence.
'''
logging.basicConfig()
log = logging.getLogger("char_rnn")
log.setLevel(logging.DEBUG)
# The mutable default set() here is intentional: it persists across calls and
# accumulates created net names like a global variable.
def CreateNetOnce(net, created_names=set()): # noqa
name = net.Name()
if name not in created_names:
created_names.add(name)
workspace.CreateNet(net)
class CharRNN(object):
def __init__(self, args):
self.seq_length = args.seq_length
self.batch_size = args.batch_size
self.iters_to_report = args.iters_to_report
self.hidden_size = args.hidden_size
with open(args.train_data) as f:
self.text = f.read()
self.vocab = list(set(self.text))
self.char_to_idx = {ch: idx for idx, ch in enumerate(self.vocab)}
self.idx_to_char = {idx: ch for idx, ch in enumerate(self.vocab)}
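        # D is the vocabulary size; each character is fed to the network as a
        # one-hot vector of length D.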
self.D = len(self.char_to_idx)
print("Input has {} characters. Total input size: {}".format(
len(self.vocab), len(self.text)))
def CreateModel(self):
log.debug("Start training")
model = model_helper.ModelHelper(name="char_rnn")
input_blob, seq_lengths, hidden_init, cell_init, target = \
model.net.AddExternalInputs(
'input_blob',
'seq_lengths',
'hidden_init',
'cell_init',
'target',
)
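        # The LSTM helper from rnn_cell unrolls a recurrent network over the
        # input sequence. It returns the hidden state at every time step as
        # well as the final hidden and cell states, which we keep so that
        # state can carry over between consecutive chunks of text.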
hidden_output_all, self.hidden_output, _, self.cell_state = LSTM(
model, input_blob, seq_lengths, (hidden_init, cell_init),
self.D, self.hidden_size, scope="LSTM")
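        # A fully connected layer projects each hidden state onto one logit
        # per character in the vocabulary.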
output = brew.fc(
model,
hidden_output_all,
None,
dim_in=self.hidden_size,
dim_out=self.D,
axis=2
)
        # The axis is 2 because the first two dimensions are T (time) and
        # N (batch size); we treat them as one big batch of size T * N.
softmax = model.net.Softmax(output, 'softmax', axis=2)
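        # Flatten the T and N dimensions so the softmax output becomes a 2D
        # tensor of shape [T * N, D], matching the flat target vector used by
        # LabelCrossEntropy below.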
softmax_reshaped, _ = model.net.Reshape(
softmax, ['softmax_reshaped', '_'], shape=[-1, self.D])
# Create a copy of the current net. We will use it on the forward
# pass where we don't need loss and backward operators
self.forward_net = core.Net(model.net.Proto())
xent = model.net.LabelCrossEntropy([softmax_reshaped, target], 'xent')
        # The loss is averaged both across the batch and through time.
        # That's why the learning rate below is multiplied by self.seq_length.
loss = model.net.AveragedLoss(xent, 'loss')
model.AddGradientOperators([loss])
        # Use the build_sgd helper to add an SGD optimizer.
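        # With the "step" policy the learning rate is multiplied by gamma
        # after every stepsize iterations, so it decays gradually over time.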
build_sgd(
model,
base_learning_rate=0.1 * self.seq_length,
policy="step",
stepsize=1,
gamma=0.9999
)
self.model = model
self.predictions = softmax
self.loss = loss
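        # prepare_state copies the last hidden and cell states produced by
        # the previous run back into the init blobs, so the recurrent state
        # carries over from one seq_length chunk to the next.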
self.prepare_state = core.Net("prepare_state")
self.prepare_state.Copy(self.hidden_output, hidden_init)
self.prepare_state.Copy(self.cell_state, cell_init)
def _idx_at_pos(self, pos):
return self.char_to_idx[self.text[pos]]
def TrainModel(self):
log.debug("Training model")
workspace.RunNetOnce(self.model.param_init_net)
        # Initial loss estimate, as though we predicted a uniform probability
        # for each character.
smooth_loss = -np.log(1.0 / self.D) * self.seq_length
last_n_iter = 0
last_n_loss = 0.0
num_iter = 0
N = len(self.text)
        # We split the text into batch_size pieces. Each piece is consumed
        # only by its corresponding batch entry during the training process.
text_block_positions = np.zeros(self.batch_size, dtype=np.int32)
text_block_size = N // self.batch_size
text_block_starts = list(range(0, N, text_block_size))
text_block_sizes = [text_block_size] * self.batch_size
text_block_sizes[self.batch_size - 1] += N % self.batch_size
assert sum(text_block_sizes) == N
        # Initialize the output states, which will be copied to the input
        # states within the loop below.
workspace.FeedBlob(self.hidden_output, np.zeros(
[1, self.batch_size, self.hidden_size], dtype=np.float32
))
workspace.FeedBlob(self.cell_state, np.zeros(
[1, self.batch_size, self.hidden_size], dtype=np.float32
))
workspace.CreateNet(self.prepare_state)
        # We iterate over the text in a loop many times. Each time we pick a
        # seq_length segment and feed it to the LSTM as a sequence.
last_time = datetime.now()
progress = 0
while True:
workspace.FeedBlob(
"seq_lengths",
np.array([self.seq_length] * self.batch_size,
dtype=np.int32)
)
workspace.RunNet(self.prepare_state.Name())
input = np.zeros(
[self.seq_length, self.batch_size, self.D]
).astype(np.float32)
target = np.zeros(
[self.seq_length * self.batch_size]
).astype(np.int32)
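            # One-hot encode the current seq_length characters of every batch
            # entry and record the following character as the target. Targets
            # are laid out as i * batch_size + e so that they line up with the
            # reshaped [T * N, D] softmax output.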
for e in range(self.batch_size):
for i in range(self.seq_length):
pos = text_block_starts[e] + text_block_positions[e]
input[i][e][self._idx_at_pos(pos)] = 1
target[i * self.batch_size + e] =\
self._idx_at_pos((pos + 1) % N)
text_block_positions[e] = (
text_block_positions[e] + 1) % text_block_sizes[e]
progress += 1
workspace.FeedBlob('input_blob', input)
workspace.FeedBlob('target', target)
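            # The training net (forward, gradient and SGD operators) is
            # created lazily on the first iteration and reused afterwards.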
CreateNetOnce(self.model.net)
workspace.RunNet(self.model.net.Name())
num_iter += 1
last_n_iter += 1
if num_iter % self.iters_to_report == 0:
new_time = datetime.now()
print("Characters Per Second: {}". format(
int(progress / (new_time - last_time).total_seconds())
))
print("Iterations Per Second: {}". format(
int(self.iters_to_report /
(new_time - last_time).total_seconds())
))
last_time = new_time
progress = 0
print("{} Iteration {} {}".
format('-' * 10, num_iter, '-' * 10))
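            # AveragedLoss averages over all T * N entries, so multiplying by
            # seq_length gives the loss per sequence (summed over time,
            # averaged over the batch).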
loss = workspace.FetchBlob(self.loss) * self.seq_length
smooth_loss = 0.999 * smooth_loss + 0.001 * loss
last_n_loss += loss
if num_iter % self.iters_to_report == 0:
self.GenerateText(500, np.random.choice(self.vocab))
log.debug("Loss since last report: {}"
.format(last_n_loss / last_n_iter))
log.debug("Smooth loss: {}".format(smooth_loss))
last_n_loss = 0.0
last_n_iter = 0
def GenerateText(self, num_characters, ch):
        # Given a starting symbol, we feed a sequence of size 1 to our RNN
        # num_characters times. After each step we use the output
        # probabilities to pick the next character to feed to the network.
        # The same character also becomes part of the generated text.
CreateNetOnce(self.forward_net)
text = '' + ch
for _i in range(num_characters):
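            # Each step feeds a single one-hot character; the recurrent
            # state carries over between steps via prepare_state.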
workspace.FeedBlob(
"seq_lengths", np.array([1] * self.batch_size, dtype=np.int32))
workspace.RunNet(self.prepare_state.Name())
input = np.zeros([1, self.batch_size, self.D]).astype(np.float32)
input[0][0][self.char_to_idx[ch]] = 1
workspace.FeedBlob("input_blob", input)
workspace.RunNet(self.forward_net.Name())
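            # The softmax output has shape [1, batch_size, D]; sample the
            # next character from the distribution of the first batch entry.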
p = workspace.FetchBlob(self.predictions)
            next_char_idx = np.random.choice(self.D, p=p[0][0])
            ch = self.idx_to_char[next_char_idx]
text += ch
print(text)
@utils.debug
def main():
parser = argparse.ArgumentParser(
description="Caffe2: Char RNN Training"
)
parser.add_argument("--train_data", type=str, default=None,
help="Path to training data in a text file format",
required=True)
parser.add_argument("--seq_length", type=int, default=25,
help="One training example sequence length")
parser.add_argument("--batch_size", type=int, default=1,
help="Training batch size")
parser.add_argument("--iters_to_report", type=int, default=500,
help="How often to report loss and generate text")
parser.add_argument("--hidden_size", type=int, default=100,
help="Dimension of the hidden representation")
parser.add_argument("--gpu", action="store_true",
help="If set, training is going to use GPU 0")
args = parser.parse_args()
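    # Run on GPU 0 when --gpu is passed; otherwise fall back to the CPU.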
device = core.DeviceOption(
workspace.GpuDeviceType if args.gpu else caffe2_pb2.CPU, 0)
with core.DeviceScope(device):
model = CharRNN(args)
model.CreateModel()
model.TrainModel()
if __name__ == '__main__':
workspace.GlobalInit(['caffe2', '--caffe2_log_level=2'])
main()