## @package train
# Module caffe2.python.models.seq2seq.train
import argparse
import collections
import logging
import math
import numpy as np
import random
import time
import sys
import os
import caffe2.proto.caffe2_pb2 as caffe2_pb2
from caffe2.python import core, workspace, data_parallel_model
import caffe2.python.models.seq2seq.seq2seq_util as seq2seq_util
from caffe2.python.models.seq2seq.seq2seq_model_helper import Seq2SeqModelHelper
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stderr))
# Container for one padded mini-batch as fed to the seq2seq nets.
# The *_inputs / targets / target_weights arrays are produced time-major
# (max_seq_length x batch_size) by prepare_batch(); the *_lengths fields
# are 1-D int32 arrays of per-example (unpadded) lengths.
Batch = collections.namedtuple('Batch', [
    'encoder_inputs',    # int32, reversed source tokens, PAD-filled
    'encoder_lengths',   # int32, original source lengths
    'decoder_inputs',    # int32, GO + target tokens, PAD-filled
    'decoder_lengths',   # int32, lengths including the GO token
    'targets',           # int32, target tokens + EOS, PAD-filled
    'target_weights',    # float32, 1.0 on real tokens, 0.0 on PAD
])
def prepare_batch(batch):
    """Pad a list of (source_tokens, target_tokens) pairs into a Batch.

    Source sequences are reversed (a common seq2seq trick) and right-padded
    with PAD_ID; decoder inputs get a leading GO_ID and targets a trailing
    EOS_ID. All 2-D arrays are returned time-major, i.e. transposed to
    (max_length, batch_size).

    Args:
        batch: list of (source_seq, target_seq) token-id list pairs.

    Returns:
        A Batch namedtuple of numpy arrays.
    """
    source_lengths = [len(pair[0]) for pair in batch]
    longest_source = max(source_lengths)
    longest_target = max(len(pair[1]) for pair in batch)

    dec_lengths = []
    enc_rows = []
    dec_rows = []
    target_rows = []
    weight_rows = []

    for src, tgt in batch:
        src_pad = [seq2seq_util.PAD_ID] * (longest_source - len(src))
        enc_rows.append(list(reversed(src)) + src_pad)

        tgt_pad = [seq2seq_util.PAD_ID] * (longest_target - len(tgt))
        dec_in = [seq2seq_util.GO_ID] + tgt
        dec_lengths.append(len(dec_in))
        dec_rows.append(dec_in + tgt_pad)

        row_targets = tgt + [seq2seq_util.EOS_ID] + tgt_pad
        target_rows.append(row_targets)

        # A fully empty example gets all-zero weights (its EOS target would
        # otherwise still be counted); real tokens weigh 1, PAD weighs 0.
        if not src and not tgt:
            weight_rows.append([0] * len(row_targets))
        else:
            weight_rows.append([
                0 if token == seq2seq_util.PAD_ID else 1
                for token in row_targets
            ])

    return Batch(
        encoder_inputs=np.array(enc_rows, dtype=np.int32).T,
        encoder_lengths=np.array(source_lengths, dtype=np.int32),
        decoder_inputs=np.array(dec_rows, dtype=np.int32).T,
        decoder_lengths=np.array(dec_lengths, dtype=np.int32),
        targets=np.array(target_rows, dtype=np.int32).T,
        target_weights=np.array(weight_rows, dtype=np.float32).T,
    )
class Seq2SeqModelCaffe2(object):
def _build_model(
    self,
    init_params,
):
    """Construct the training net and a separate forward-only net.

    Builds two Seq2SeqModelHelper graphs that share parameter names
    (both call _build_shared/_build_embeddings), then wires gradients
    and gradient-clipping updates into the training model. On GPU,
    both graphs are replicated across devices via
    data_parallel_model.Parallelize_GPU.

    Args:
        init_params: forwarded to Seq2SeqModelHelper; whether param init
            ops are added to the init nets.

    Side effects: sets self.model (training model) and self.forward_net.
    """
    model = Seq2SeqModelHelper(init_params=init_params)
    self._build_shared(model)
    self._build_embeddings(model)
    forward_model = Seq2SeqModelHelper(init_params=init_params)
    self._build_shared(forward_model)
    self._build_embeddings(forward_model)
    if self.num_gpus == 0:
        # CPU path: single replica; add gradients and dense clipped update.
        loss_blobs = self.model_build_fun(model)
        model.AddGradientOperators(loss_blobs)
        self.norm_clipped_grad_update(
            model,
            scope='norm_clipped_grad_update'
        )
        self.forward_model_build_fun(forward_model)
    else:
        # Each GPU must receive an equal shard of the batch.
        assert (self.batch_size % self.num_gpus) == 0
        # Forward-only net: no parameter updates (param_update_builder_fun=None).
        data_parallel_model.Parallelize_GPU(
            forward_model,
            input_builder_fun=lambda m: None,
            forward_pass_builder_fun=self.forward_model_build_fun,
            param_update_builder_fun=None,
            devices=list(range(self.num_gpus)),
        )

        def clipped_grad_update_bound(model):
            self.norm_clipped_grad_update(
                model,
                scope='norm_clipped_grad_update',
            )

        # Training net: Parallelize_GPU adds gradients and per-device updates.
        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=lambda m: None,
            forward_pass_builder_fun=self.model_build_fun,
            param_update_builder_fun=clipped_grad_update_bound,
            devices=list(range(self.num_gpus)),
        )
        # Sparse (embedding) gradients are clipped/applied outside the
        # per-device update. NOTE(review): this runs only on the GPU path —
        # presumably the CPU path handles sparse grads in
        # norm_clipped_grad_update; confirm.
        self.norm_clipped_sparse_grad_update(
            model,
            scope='norm_clipped_sparse_grad_update',
        )
    self.model = model
    self.forward_net = forward_model.net
def _build_shared(self, model):
    """Register non-trainable scalar state shared by all model replicas.

    Pins learning_rate, global_step and start_time to the CPU so every
    device reads the same values.
    """
    initial_lr = float(
        self.model_params['optimizer_params']['learning_rate']
    )
    cpu_device = core.DeviceOption(caffe2_pb2.CPU)
    with core.DeviceScope(cpu_device):
        self.learning_rate = model.AddParam(
            name='learning_rate',
            init_value=initial_lr,
            trainable=False,
        )
        self.global_step = model.AddParam(
            name='global_step',
            init_value=0,
            trainable=False,
        )
        # Wall-clock start; used for reporting elapsed training time.
        self.start_time = model.AddParam(
            name='start_time',
            init_value=time.time(),
            trainable=False,
        )
def _build_embeddings(self, model):
    """Create encoder and decoder embedding tables on the CPU.

    Both tables are initialized Uniform(-sqrt(3), sqrt(3)) and registered
    as trainable params; blobs are stored on self for later lookup ops.
    """
    bound = math.sqrt(3)

    def uniform_table(name, num_rows, num_cols):
        # UniformFill init + registration in model.params (trainable).
        table = model.param_init_net.UniformFill(
            [],
            name,
            shape=[num_rows, num_cols],
            min=-bound,
            max=bound,
        )
        model.params.append(table)
        return table

    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
        self.encoder_embeddings = uniform_table(
            'encoder_embeddings',
            self.source_vocab_size,
            self.model_params['encoder_embedding_size'],
        )
        self.decoder_embeddings = uniform_table(
            'decoder_embeddings',
            self.target_vocab_size,
            self.model_params['decoder_embedding_size'],
        )
def model_build_fun(self, model, forward_only=False, loss_scale=None):
    """Build the seq2seq graph: encoder, decoder, projection and loss.

    Args:
        model: Seq2SeqModelHelper to add operators to.
        forward_only: build the decoder for inference rather than training.
        loss_scale: accepted for data_parallel_model builder compatibility;
            not used here (the loss is scaled by 1/batch_size below).

    Returns:
        [total_loss_scalar_weighted]: single-element list with the scalar
        loss blob, as expected by AddGradientOperators/Parallelize_GPU.
    """
    # External inputs are name-scoped so each GPU replica gets its own blobs.
    encoder_inputs = model.net.AddExternalInput(
        workspace.GetNameScope() + 'encoder_inputs',
    )
    encoder_lengths = model.net.AddExternalInput(
        workspace.GetNameScope() + 'encoder_lengths',
    )
    decoder_inputs = model.net.AddExternalInput(
        workspace.GetNameScope() + 'decoder_inputs',
    )
    decoder_lengths = model.net.AddExternalInput(
        workspace.GetNameScope() + 'decoder_lengths',
    )
    targets = model.net.AddExternalInput(
        workspace.GetNameScope() + 'targets',
    )
    target_weights = model.net.AddExternalInput(
        workspace.GetNameScope() + 'target_weights',
    )
    attention_type = self.model_params['attention']
    assert attention_type in ['none', 'regular', 'dot']
    (
        encoder_outputs,
        weighted_encoder_outputs,
        final_encoder_hidden_states,
        final_encoder_cell_states,
        encoder_units_per_layer,
    ) = seq2seq_util.build_embedding_encoder(
        model=model,
        encoder_params=self.encoder_params,
        num_decoder_layers=len(self.model_params['decoder_layer_configs']),
        inputs=encoder_inputs,
        input_lengths=encoder_lengths,
        vocab_size=self.source_vocab_size,
        embeddings=self.encoder_embeddings,
        embedding_size=self.model_params['encoder_embedding_size'],
        use_attention=(attention_type != 'none'),
        num_gpus=self.num_gpus,
    )
    (
        decoder_outputs,
        decoder_output_size,
    ) = seq2seq_util.build_embedding_decoder(
        model,
        decoder_layer_configs=self.model_params['decoder_layer_configs'],
        inputs=decoder_inputs,
        input_lengths=decoder_lengths,
        encoder_lengths=encoder_lengths,
        encoder_outputs=encoder_outputs,
        weighted_encoder_outputs=weighted_encoder_outputs,
        final_encoder_hidden_states=final_encoder_hidden_states,
        final_encoder_cell_states=final_encoder_cell_states,
        encoder_units_per_layer=encoder_units_per_layer,
        vocab_size=self.target_vocab_size,
        embeddings=self.decoder_embeddings,
        embedding_size=self.model_params['decoder_embedding_size'],
        attention_type=attention_type,
        # BUGFIX: was hard-coded to False, so the forward_only argument
        # (set True by forward_model_build_fun) was silently ignored and
        # the inference net was built as a training decoder.
        forward_only=forward_only,
        num_gpus=self.num_gpus,
    )
    output_logits = seq2seq_util.output_projection(
        model=model,
        decoder_outputs=decoder_outputs,
        decoder_output_size=decoder_output_size,
        target_vocab_size=self.target_vocab_size,
        decoder_softmax_size=self.model_params['decoder_softmax_size'],
    )
    # Flatten (time, batch) targets/weights to 1-D to match the flattened
    # logits rows for SoftmaxWithLoss.
    targets, _ = model.net.Reshape(
        [targets],
        ['targets', 'targets_old_shape'],
        shape=[-1],
    )
    target_weights, _ = model.net.Reshape(
        [target_weights],
        ['target_weights', 'target_weights_old_shape'],
        shape=[-1],
    )
    # only_loss: probabilities output is not produced (blob name marks it
    # as invalid); loss_per_word is the weight-averaged per-token loss.
    _, loss_per_word = model.net.SoftmaxWithLoss(
        [output_logits, targets, target_weights],
        ['OutputProbs_INVALID', 'loss_per_word'],
        only_loss=True,
    )
    # Rescale: per-word average -> total -> per-example (1/batch_size).
    num_words = model.net.SumElements(
        [target_weights],
        'num_words',
    )
    total_loss_scalar = model.net.Mul(
        [loss_per_word, num_words],
        'total_loss_scalar',
    )
    total_loss_scalar_weighted = model.net.Scale(
        [total_loss_scalar],
        'total_loss_scalar_weighted',
        scale=1.0 / self.batch_size,
    )
    return [total_loss_scalar_weighted]
def forward_model_build_fun(self, model, loss_scale=None):
return self.model_build_fun(
model=model,
forward_only=True,
loss_scale=loss_scale
)
def _calc_norm_ratio(self, model, params, scope, ONE):
with core.NameScope(scope):
grad_squared_sums = []
for i, param in enumerate(params):
logger.info(param)
grad = (
model.param_to_grad[param]
if not isinstance(
model.param_to_grad[param],
core.GradientSlice,
) else model.param_to_grad[param].values
)
grad_squared = model.net.Sqr(
[grad],
'grad_{}_squared'.format(i),
)
grad_squared_sum = model.net.SumElements(
grad_squared,
'grad_{}_squared_sum'.format(i),
)
grad_squared_sums.append(grad_squared_sum)
grad_squared_full_sum = model.net.Sum(
grad_squared_sums,
'grad_squared_full_sum',
)
global_norm = model.net.Pow(
grad_squared_full_sum,
'global_norm',
exponent=0.5,
)
clip_norm = model.param_init_net.ConstantFill(
[],
'clip_norm',
shape=[],
value=float(self.model_params['max_gradient_norm']),
)
max_norm = model.net.Max(
[global_norm, clip_norm],
'max_norm',
Loading ...