def __init__(self,
rnn_config: rnn.RNNConfig,
prefix=C.BIDIRECTIONALRNN_PREFIX,
layout=C.TIME_MAJOR,
encoder_class: Callable = RecurrentEncoder) -> None:
utils.check_condition(rnn_config.num_hidden % 2 == 0,
"num_hidden must be a multiple of 2 for BiDirectionalRNNEncoders.")
super().__init__(rnn_config.dtype)
self.rnn_config = rnn_config
self.internal_rnn_config = rnn_config.copy(num_hidden=rnn_config.num_hidden // 2)
if layout[0] == 'N':
logger.warning("Batch-major layout for encoder input. Consider using time-major layout for faster speed")
# time-major layout as _encode needs to swap layout for SequenceReverse
self.forward_rnn = encoder_class(rnn_config=self.internal_rnn_config,
prefix=prefix + C.FORWARD_PREFIX,
layout=C.TIME_MAJOR)
self.reverse_rnn = encoder_class(rnn_config=self.internal_rnn_config,
prefix=prefix + C.REVERSE_PREFIX,
layout=C.TIME_MAJOR)
self.layout = layout
self.prefix = prefix
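For intuition, a minimal sketch of the size bookkeeping above (the values are illustrative, not taken from any real configuration): each direction gets half of num_hidden so that concatenating the forward and reverse outputs restores the configured size, which is why num_hidden must be even.

num_hidden = 512                        # illustrative value
per_direction = num_hidden // 2         # 256 for the forward RNN, 256 for the reverse RNN
assert per_direction * 2 == num_hidden  # holds only when num_hidden is even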
def _test_parameter_averaging(model_path: str):
"""
Runs parameter averaging with all available strategies
"""
for strategy in C.AVERAGE_CHOICES:
points = sockeye.average.find_checkpoints(model_path=model_path,
size=4,
strategy=strategy,
metric=C.PERPLEXITY)
assert len(points) > 0
averaged_params = sockeye.average.average(points)
assert averaged_params
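Conceptually, averaging checkpoints amounts to an element-wise mean over the selected parameter dictionaries. A rough sketch under the assumption that each checkpoint is a dict mapping parameter names to arrays (e.g. mx.nd.NDArray); the real sockeye.average.average may differ in details:

def average_params_sketch(param_dicts):
    # Element-wise mean of the parameters from several checkpoints.
    names = param_dicts[0].keys()
    return {name: sum(p[name] for p in param_dicts) / len(param_dicts)
            for name in names}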
"Use any of 'dist_sync', 'dist_device_sync' and 'dist_async' for distributed "
"training. Default: %(default)s.")
train_params.add_argument("--gradient-compression-type",
type=str,
default=C.GRADIENT_COMPRESSION_NONE,
choices=C.GRADIENT_COMPRESSION_TYPES,
help='Type of gradient compression to use. Default: %(default)s.')
train_params.add_argument("--gradient-compression-threshold",
type=float,
default=0.5,
help="Threshold for gradient compression if --gctype is '2bit'. Default: %(default)s.")
train_params.add_argument('--weight-init',
type=str,
default=C.INIT_XAVIER,
choices=C.INIT_TYPES,
help='Type of base weight initialization. Default: %(default)s.')
train_params.add_argument('--weight-init-scale',
type=float,
default=3.0,
help='Weight initialization scale. Applies to uniform (scale) and xavier (magnitude). '
'Default: %(default)s.')
train_params.add_argument('--weight-init-xavier-factor-type',
type=str,
default=C.INIT_XAVIER_FACTOR_TYPE_AVG,
choices=C.INIT_XAVIER_FACTOR_TYPES,
help='Xavier factor type. Default: %(default)s.')
train_params.add_argument('--weight-init-xavier-rand-type',
type=str,
default=C.RAND_TYPE_UNIFORM,
choices=[C.RAND_TYPE_UNIFORM, C.RAND_TYPE_GAUSSIAN],
help='Xavier random number generator type. Default: %(default)s.')
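The flags above are easier to read next to a concrete invocation. Below is a hypothetical, stripped-down parser showing how they combine on the command line; the literal defaults stand in for the C.* constants, and the real sockeye.train parser defines many more required arguments.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--weight-init', default='xavier')
parser.add_argument('--weight-init-scale', type=float, default=3.0)
parser.add_argument('--weight-init-xavier-factor-type', default='avg')
parser.add_argument('--weight-init-xavier-rand-type', default='uniform')

args = parser.parse_args(['--weight-init-scale', '2.34',
                          '--weight-init-xavier-rand-type', 'gaussian'])
assert args.weight_init_scale == 2.34
assert args.weight_init_xavier_rand_type == 'gaussian'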
if os.path.exists(params_file):
os.unlink(params_file)
os.symlink(os.path.join("..", params_base_fname), params_file)
# (2) Optimizer states
opt_state_fname = os.path.join(training_state_dirname, C.OPT_STATES_LAST)
self.model.save_optimizer_states(opt_state_fname)
# (3) Data iterator
train_iter.save_state(os.path.join(training_state_dirname, C.BUCKET_ITER_STATE_NAME))
# (4) Random generators
# RNG states: python's random and np.random provide functions for saving and
# restoring their state; mxnet does not, but mxnet's RNG is not used directly
# in our code as far as we know
with open(os.path.join(training_state_dirname, C.RNG_STATE_NAME), "wb") as fp:
pickle.dump(random.getstate(), fp)
pickle.dump(np.random.get_state(), fp)
# (5) Training state
self.state.save(os.path.join(training_state_dirname, C.TRAINING_STATE_NAME))
# (6) Learning rate scheduler
with open(os.path.join(training_state_dirname, C.SCHEDULER_STATE_NAME), "wb") as fp:
pickle.dump(self.optimizer_config.lr_scheduler, fp)
# First we rename the existing directory to minimize the risk of state
# loss if the process is aborted during deletion (which will be slower
# than directory renaming)
delete_training_state_dirname = os.path.join(self.model.output_dir, C.TRAINING_STATE_TEMP_DELETENAME)
if os.path.exists(self.training_state_dirname):
os.rename(self.training_state_dirname, delete_training_state_dirname)
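For completeness, a sketch of the matching restore path for step (4): the two RNG states have to be read back in the same order they were dumped. The file name below mirrors the role of C.RNG_STATE_NAME but is otherwise an assumption.

import os
import pickle
import random
import numpy as np

def load_rng_states(training_state_dirname: str, rng_state_name: str = "rng.pkl") -> None:
    # Restore python's and numpy's global RNG states saved by the code above.
    with open(os.path.join(training_state_dirname, rng_state_name), "rb") as fp:
        random.setstate(pickle.load(fp))
        np.random.set_state(pickle.load(fp))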
def __init__(self,
model: ScoringModel,
source_vocabs: List[vocab.Vocab],
target_vocab: vocab.Vocab,
constant_length_ratio: float = -1.0) -> None:
self.source_vocab_inv = vocab.reverse_vocab(source_vocabs[0])
self.target_vocab_inv = vocab.reverse_vocab(target_vocab)
self.model = model
self.exclude_list = {source_vocabs[0][C.BOS_SYMBOL], target_vocab[C.EOS_SYMBOL], C.PAD_ID}
self.constant_length_ratio = constant_length_ratio
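vocab.reverse_vocab is used here to map ids back to tokens when printing scored sentences; assuming the vocabulary is a plain token-to-id dict, its effect can be sketched as a simple inversion (the example vocabulary below is made up):

def reverse_vocab_sketch(vocab_dict):
    # Invert token -> id into id -> token.
    return {idx: token for token, idx in vocab_dict.items()}

toy_vocab = {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3, "hello": 4}
assert reverse_vocab_sketch(toy_vocab)[4] == "hello"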
Returns the context vector, computed as a weighted sum over the values,
and the attention probabilities.
:param values: Shape: (batch_size, seq_len, encoder_num_hidden).
:param length: Shape: (batch_size,).
:param logits: Shape: (batch_size, seq_len, 1).
:param dtype: data type.
:return: context: (batch_size, encoder_num_hidden), attention_probs: (batch_size, seq_len).
"""
# masks attention scores according to sequence length.
# (batch_size, seq_len, 1)
logits = mx.sym.SequenceMask(data=logits,
axis=1,
use_sequence_length=True,
sequence_length=length,
value=-C.LARGE_VALUES[dtype])
# (batch_size, seq_len, 1)
probs = mx.sym.softmax(logits, axis=1, name='attention_softmax')
# batch_dot: (batch, M, K) X (batch, K, N) -> (batch, M, N).
# (batch_size, seq_len, num_hidden) X (batch_size, seq_len, 1) -> (batch_size, num_hidden, 1)
context = mx.sym.batch_dot(lhs=values, rhs=probs, transpose_a=True)
# (batch_size, encoder_num_hidden, 1) -> (batch_size, encoder_num_hidden)
context = mx.sym.reshape(data=context, shape=(0, 0))
probs = mx.sym.reshape(data=probs, shape=(0, 0))
return context, probs
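The same computation can be followed end to end in NumPy, which makes the shape manipulation easier to see; this is an illustration of the masking, softmax, and batched dot product, not the symbolic MXNet graph itself.

import numpy as np

batch_size, seq_len, num_hidden = 2, 4, 3
values = np.random.rand(batch_size, seq_len, num_hidden)
logits = np.random.rand(batch_size, seq_len, 1)
length = np.array([4, 2])  # the second sequence has two padded positions

# mask scores past each sequence's length with a large negative value
mask = np.arange(seq_len)[None, :, None] >= length[:, None, None]
masked = np.where(mask, -1e9, logits)

# softmax over the time axis -> attention probabilities, (batch_size, seq_len, 1)
probs = np.exp(masked - masked.max(axis=1, keepdims=True))
probs /= probs.sum(axis=1, keepdims=True)

# (batch_size, num_hidden, seq_len) x (batch_size, seq_len, 1) -> (batch_size, num_hidden)
context = np.matmul(values.transpose(0, 2, 1), probs).squeeze(-1)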
help='Initial learning rate. Default: %(default)s.')
train_params.add_argument('--weight-decay',
type=float,
default=0.0,
help='Weight decay constant. Default: %(default)s.')
train_params.add_argument('--momentum',
type=float,
default=None,
help='Momentum constant. Default: %(default)s.')
train_params.add_argument('--gradient-clipping-threshold',
type=float,
default=1.0,
help='Clip absolute gradient values greater than this value. '
'Set to negative to disable. Default: %(default)s.')
train_params.add_argument('--gradient-clipping-type',
choices=C.GRADIENT_CLIPPING_TYPES,
default=C.GRADIENT_CLIPPING_TYPE_NONE,
help='The type of gradient clipping. Default: %(default)s.')
train_params.add_argument('--learning-rate-scheduler-type',
default=C.LR_SCHEDULER_PLATEAU_REDUCE,
choices=C.LR_SCHEDULERS,
help='Learning rate scheduler type. Default: %(default)s.')
train_params.add_argument('--learning-rate-reduce-factor',
type=float,
default=0.7,
help="Factor to multiply learning rate with "
"(for 'plateau-reduce' learning rate scheduler). Default: %(default)s.")
train_params.add_argument('--learning-rate-reduce-num-not-improved',
type=int,
default=8,
help="For 'plateau-reduce' learning rate scheduler. Adjust learning rate "
:return: The data iterators (train, validation, config_data) as well as the source and target vocabularies.
"""
num_words_source, num_words_target = args.num_words
num_words_source = num_words_source if num_words_source > 0 else None
num_words_target = num_words_target if num_words_target > 0 else None
word_min_count_source, word_min_count_target = args.word_min_count
batch_num_devices = 1 if args.use_cpu else sum(-di if di < 0 else 1 for di in args.device_ids)
batch_by_words = args.batch_type == C.BATCH_TYPE_WORD
validation_sources = [args.validation_source] + args.validation_source_factors
validation_sources = [str(os.path.abspath(source)) for source in validation_sources]
validation_target = str(os.path.abspath(args.validation_target))
either_raw_or_prepared_error_msg = "Either specify a raw training corpus with %s and %s or a preprocessed corpus " \
"with %s." % (C.TRAINING_ARG_SOURCE,
C.TRAINING_ARG_TARGET,
C.TRAINING_ARG_PREPARED_DATA)
if args.prepared_data is not None:
utils.check_condition(args.source is None and args.target is None, either_raw_or_prepared_error_msg)
if not resume_training:
utils.check_condition(args.source_vocab is None and args.target_vocab is None,
"You are using a prepared data folder, which is tied to a vocabulary. "
"To change it you need to rerun data preparation with a different vocabulary.")
train_iter, validation_iter, data_config, source_vocabs, target_vocab = data_io.get_prepared_data_iters(
prepared_data_dir=args.prepared_data,
validation_sources=validation_sources,
validation_target=validation_target,
shared_vocab=shared_vocab,
batch_size=args.batch_size,
batch_by_words=batch_by_words,
batch_num_devices=batch_num_devices)
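As a worked example of the batch_num_devices expression above: as I understand sockeye's --device-ids convention, a negative id -N asks for N GPUs to be acquired automatically, while non-negative ids name specific GPUs, so the sum simply counts devices either way.

def batch_num_devices(use_cpu, device_ids):
    return 1 if use_cpu else sum(-di if di < 0 else 1 for di in device_ids)

assert batch_num_devices(True, []) == 1          # CPU: a single device
assert batch_num_devices(False, [0, 1, 3]) == 3  # three explicitly named GPUs
assert batch_num_devices(False, [-4]) == 4       # acquire four GPUs automatically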
beam_histories[sent]["predicted_tokens"].append([self.vocab_target_inv[x] for x in
best_word_indices_sent])
# for later sentences in the matrix, shift from e.g. [5, 6, 7, 8, 6] to [0, 1, 2, 3, 1]
shifted_parents = best_hyp_indices[rows] - (sent * self.beam_size)
beam_histories[sent]["parent_ids"].append(shifted_parents.asnumpy().tolist())
beam_histories[sent]["scores"].append(unnormalized_scores[rows].asnumpy().flatten().tolist())
beam_histories[sent]["normalized_scores"].append(
normalized_scores[rows].asnumpy().flatten().tolist())
# Collect best hypotheses, best word indices, and attention scores
best_hyp_indices_list.append(best_hyp_indices)
best_word_indices_list.append(best_word_indices)
attentions.append(attention_scores)
if self.beam_search_stop == C.BEAM_SEARCH_STOP_FIRST:
at_least_one_finished = finished.reshape((batch_size, self.beam_size)).sum(axis=1) > 0
if at_least_one_finished.sum().asscalar() == batch_size:
break
else:
if finished.sum().asscalar() == batch_size * self.beam_size: # all finished
break
# (9) update models' state with winning hypotheses (ascending)
for ms in model_states:
ms.sort_state(best_hyp_indices)
logger.debug("Finished after %d / %d steps.", t + 1, max_output_length)
# (10) Sort the hypotheses within each sentence (normalization for finished hyps may have unsorted them).
folded_accumulated_scores = scores_accumulated.reshape((batch_size,
self.beam_size * scores_accumulated.shape[-1]))
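To make the parent-index bookkeeping above concrete, a small NumPy sketch with made-up values (beam_size = 5, second sentence in the batch): global hypothesis indices are shifted back into the [0, beam_size) range of their own sentence.

import numpy as np

beam_size, sent = 5, 1
best_hyp_indices_rows = np.array([5, 6, 7, 8, 6])
shifted_parents = best_hyp_indices_rows - sent * beam_size
assert shifted_parents.tolist() == [0, 1, 2, 3, 1]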
train_params.add_argument('--embed-weight-init',
type=str,
default=C.EMBED_INIT_DEFAULT,
choices=C.EMBED_INIT_TYPES,
help='Type of embedding matrix weight initialization. If normal, initializes embedding '
'weights using a normal distribution with std=1/sqrt(vocab_size). '
'Default: %(default)s.')
train_params.add_argument('--initial-learning-rate',
type=float,
default=0.0002,
help='Initial learning rate. Default: %(default)s.')
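The 'normal' embedding initialization described above can be sketched directly in NumPy (vocabulary and embedding sizes are illustrative): weights are drawn from a normal distribution whose standard deviation shrinks with vocabulary size.

import numpy as np

vocab_size, embed_size = 32000, 512
std = 1.0 / np.sqrt(vocab_size)
embed_weight = np.random.normal(loc=0.0, scale=std, size=(vocab_size, embed_size))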