def test_check_condition_true():
    utils.check_condition(1 == 1, "Nice")
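# A minimal sketch of the helper under test (hedged: modeled on sockeye.utils,
# where check_condition raises SockeyeError when the condition is false; the
# SockeyeError class here is a local stand-in for illustration):
import pytest

class SockeyeError(Exception):
    pass

def check_condition(condition: bool, error_message: str):
    """Raise SockeyeError with error_message if the condition does not hold."""
    if not condition:
        raise SockeyeError(error_message)

# The complementary failing case:
def test_check_condition_false():
    with pytest.raises(SockeyeError):
        check_condition(1 == 2, "Not nice")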
help="Custom test data (pairs of source and target).")
arg_parser.add_argument("--custom-text-type", type=str, choices=CUSTOM_TEXT_TYPES, default=CUSTOM_UTF8_RAW,
help="Level of pre-processing already applied to data for custom task: none (raw), tokenization, or byte-pair encoding. Default: %(default)s.")
arg_parser.add_argument("--custom-lang", type=str, nargs=2, metavar=("SRC", "TRG"),
help="Source and target language codes for custom task (en, fr, de, etc.).")
arg_parser.add_argument("--custom-bpe-op", type=int, default=32000,
help="Number of byte-pair encoding operations for custom task. Default: %(default)s.")
arg_parser.add_argument("--gpus", type=int, metavar="N", default=1,
help="Number of GPUs to use. 0 for CPU only. Default: %(default)s.")
arg_parser.add_argument("--test", action="store_true", default=False,
help="Run in test mode (much abbreviated system build).")
args = arg_parser.parse_args()
# Listed task or fully specified custom task
utils.check_condition(args.task or all((args.custom_train, args.custom_dev, args.custom_test)),
"Please specify --task or all of: --custom-task --custom-train --custom-dev --custom-test")
# Required args for different custom tasks
if not args.task:
if args.custom_text_type == CUSTOM_UTF8_RAW:
utils.check_condition(args.custom_lang, "Please specify --custom-lang for source and target tokenization")
# Require explicit request to not train model
if not args.model:
raise RuntimeError("Please specify --model. Use --model %s to run data preparation steps only" % MODEL_NONE)
run_steps(args)
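# Illustrative invocation (script and file names hypothetical): running without
# --task while giving only --custom-train trips the first check above, since
# all((...)) requires train, dev, and test to be supplied together:
#   python autopilot.py --model none --custom-train data.train
#   -> "Please specify --task or all of: --custom-task --custom-train --custom-dev --custom-test"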
"""
utils.check_condition(len(metrics) > 0, "At least one metric must be provided.")
for metric in metrics:
utils.check_condition(metric in C.METRICS, "Unknown metric to track during training: %s" % metric)
if 'dist' in self.optimizer_config.kvstore:
# In distributed training the optimizer will run remotely. For eve we however need to pass information about
# the loss, which is not possible anymore by means of accessing self.module._curr_module._optimizer.
utils.check_condition(self.optimizer_config.name != C.OPTIMIZER_EVE,
"Eve optimizer not supported with distributed training.")
utils.check_condition(
not issubclass(type(self.optimizer_config.lr_scheduler),
lr_scheduler.AdaptiveLearningRateScheduler),
"Adaptive learning rate schedulers not supported with a dist kvstore. "
"Try a fixed schedule such as %s." % C.LR_SCHEDULER_FIXED_RATE_INV_SQRT_T)
utils.check_condition(not lr_decay_param_reset, "Parameter reset when the learning rate decays not "
"supported with distributed training.")
utils.check_condition(lr_decay_opt_states_reset == C.LR_DECAY_OPT_STATES_RESET_OFF,
"Optimizer state reset when the learning rate decays "
"not supported with distributed training.")
utils.check_condition(self.optimizer_config.gradient_clipping_type in C.GRADIENT_CLIPPING_TYPES,
"Unknown gradient clipping type %s" % self.optimizer_config.gradient_clipping_type)
utils.check_condition(early_stopping_metric in C.METRICS,
"Unsupported early-stopping metric: %s" % early_stopping_metric)
if early_stopping_metric in C.METRICS_REQUIRING_DECODER:
utils.check_condition(cp_decoder is not None, "%s requires CheckpointDecoder" % early_stopping_metric)
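    # Illustrative trigger for the guards above (attribute values hypothetical):
    # with self.optimizer_config.kvstore == "dist_sync" and
    # self.optimizer_config.name == C.OPTIMIZER_EVE, the first check_condition
    # in the 'dist' branch raises "Eve optimizer not supported with distributed
    # training." before any remote optimizer is created.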
num_words_target = num_words_target if num_words_target > 0 else None
word_min_count_source, word_min_count_target = args.word_min_count
batch_num_devices = 1 if args.use_cpu else sum(-di if di < 0 else 1 for di in args.device_ids)
batch_by_words = args.batch_type == C.BATCH_TYPE_WORD

validation_sources = [args.validation_source] + args.validation_source_factors
validation_sources = [str(os.path.abspath(source)) for source in validation_sources]
validation_target = str(os.path.abspath(args.validation_target))

either_raw_or_prepared_error_msg = "Either specify a raw training corpus with %s and %s or a preprocessed corpus " \
                                   "with %s." % (C.TRAINING_ARG_SOURCE,
                                                 C.TRAINING_ARG_TARGET,
                                                 C.TRAINING_ARG_PREPARED_DATA)
if args.prepared_data is not None:
    utils.check_condition(args.source is None and args.target is None, either_raw_or_prepared_error_msg)
    if not resume_training:
        utils.check_condition(args.source_vocab is None and args.target_vocab is None,
                              "You are using a prepared data folder, which is tied to a vocabulary. "
                              "To change it you need to rerun data preparation with a different vocabulary.")
    train_iter, validation_iter, data_config, source_vocabs, target_vocab = data_io.get_prepared_data_iters(
        prepared_data_dir=args.prepared_data,
        validation_sources=validation_sources,
        validation_target=validation_target,
        shared_vocab=shared_vocab,
        batch_size=args.batch_size,
        batch_by_words=batch_by_words,
        batch_num_devices=batch_num_devices)
    check_condition(args.source_factors_combine == C.SOURCE_FACTORS_COMBINE_SUM
                    or len(source_vocabs) == len(args.source_factors_num_embed) + 1,
                    "Data was prepared with %d source factors, but only provided %d source factor dimensions." % (
                        len(source_vocabs), len(args.source_factors_num_embed) + 1))
    if resume_training:
        # Resuming training: make sure the vocabs in the model and in the prepared data match up.
        model_source_vocabs = vocab.load_source_vocabs(output_folder)
        for i, (v, mv) in enumerate(zip(source_vocabs, model_source_vocabs)):
            utils.check_condition(vocab.are_identical(v, mv),
                                  "Prepared data and resumed model source vocab %d do not match." % i)
        model_target_vocab = vocab.load_target_vocab(output_folder)
        utils.check_condition(vocab.are_identical(target_vocab, model_target_vocab),
                              "Prepared data and resumed model target vocabs do not match.")
    check_condition(data_config.num_source_factors == len(validation_sources),
                    'Training and validation data must have the same number of factors, but found %d and %d.' % (
                        data_config.num_source_factors, len(validation_sources)))
    return train_iter, validation_iter, data_config, source_vocabs, target_vocab
else:
    utils.check_condition(args.prepared_data is None and args.source is not None and args.target is not None,
                          either_raw_or_prepared_error_msg)
    if resume_training:
    :param resume_training: Whether to resume training.
    :param output_folder: Output folder.
    :return: The data iterators (train, validation, config_data) as well as the source and target vocabularies.
    """
    _, num_words_target = args.num_words
    num_words_target = num_words_target if num_words_target > 0 else None
    _, word_min_count_target = args.word_min_count
    batch_num_devices = 1 if args.use_cpu else sum(-di if di < 0 else 1 for di in args.device_ids)
    batch_by_words = args.batch_type == C.BATCH_TYPE_WORD

    either_raw_or_prepared_error_msg = "Either specify a raw training corpus with %s or a preprocessed corpus " \
                                       "with %s." % (C.TRAINING_ARG_TARGET,
                                                     C.TRAINING_ARG_PREPARED_DATA)
    # Note: ignore args.prepared_data for the moment
    utils.check_condition(args.prepared_data is None and args.target is not None,
                          either_raw_or_prepared_error_msg)

    if resume_training:
        # Load the existing vocab created when starting the training run.
        target_vocab = vocab.vocab_from_json(os.path.join(output_folder, C.VOCAB_TRG_NAME))
        # Recover the vocabulary path from the existing config file:
        data_info = cast(data_io.DataInfo, Config.load(os.path.join(output_folder, C.DATA_INFO)))
        target_vocab_path = data_info.target_vocab
    else:
        # Load vocab:
        target_vocab_path = args.target_vocab
        # Note: we do not care about the source vocab for images, which is why some inputs are mocked.
        target_vocab = vocab.load_or_create_vocab(data=args.target,
                                                  vocab_path=target_vocab_path,
                                                  num_words=num_words_target,
help="Number of byte-pair encoding operations for custom task. Default: %(default)s.")
arg_parser.add_argument("--gpus", type=int, metavar="N", default=1,
help="Number of GPUs to use. 0 for CPU only. Default: %(default)s.")
arg_parser.add_argument("--test", action="store_true", default=False,
help="Run in test mode (much abbreviated system build).")
args = arg_parser.parse_args()
# Listed task or fully specified custom task
utils.check_condition(args.task or all((args.custom_train, args.custom_dev, args.custom_test)),
"Please specify --task or all of: --custom-task --custom-train --custom-dev --custom-test")
# Required args for different custom tasks
if not args.task:
if args.custom_text_type == CUSTOM_UTF8_RAW:
utils.check_condition(args.custom_lang, "Please specify --custom-lang for source and target tokenization")
# Require explicit request to not train model
if not args.model:
raise RuntimeError("Please specify --model. Use --model %s to run data preparation steps only" % MODEL_NONE)
run_steps(args)
if input_file is None:
    check_condition(input_factors is None, "Translating from STDIN, not expecting any factor files.")
    for sentence_id, line in enumerate(sys.stdin, 1):
        if input_is_json:
            yield inference.make_input_from_json_string(sentence_id=sentence_id,
                                                        json_string=line,
                                                        translator=translator)
        else:
            yield inference.make_input_from_factored_string(sentence_id=sentence_id,
                                                            factored_string=line,
                                                            translator=translator)
else:
    input_factors = [] if input_factors is None else input_factors
    inputs = [input_file] + input_factors
    if not input_is_json:
        check_condition(translator.num_source_factors == len(inputs),
                        "Model(s) require %d factors, but %d given (through --input and --input-factors)." % (
                            translator.num_source_factors, len(inputs)))
    with ExitStack() as exit_stack:
        streams = [exit_stack.enter_context(data_io.smart_open(i)) for i in inputs]  # pylint: disable=no-member
        for sentence_id, inputs in enumerate(zip(*streams), 1):
            if input_is_json:
                yield inference.make_input_from_json_string(sentence_id=sentence_id,
                                                            json_string=inputs[0],
                                                            translator=translator)
            else:
                yield inference.make_input_from_multiple_strings(sentence_id=sentence_id, strings=list(inputs))
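# Hedged usage sketch (the enclosing generator and its signature are assumed,
# e.g. make_inputs(input_file, translator, input_is_json, input_factors)): for
# a model with translator.num_source_factors == 3, the file branch expects the
# primary input plus two factor files so that len(inputs) == 3:
#   for trans_input in make_inputs("data.src", translator, input_is_json=False,
#                                  input_factors=["data.f1", "data.f2"]):
#       ...  # one input per line, with factors zipped line-by-line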
"Must provide None or average target length for each bucket")
data_target_average_len = list(data_target_average_len)
bucket_batch_sizes = [] # type: List[BucketBatchSize]
largest_total_num_words = 0
for buck_idx, bucket in enumerate(buckets):
# Target/label length with padding
padded_seq_len = bucket[1]
# Average target/label length excluding padding
if data_target_average_len[buck_idx] is None:
data_target_average_len[buck_idx] = padded_seq_len
average_seq_len = data_target_average_len[buck_idx]
# Word-based: num words determines num sentences
# Sentence-based: num sentences determines num words
if batch_by_words:
check_condition(padded_seq_len <= batch_size, "Word batch size must cover sequence lengths for all"
" buckets: (%d > %d)" % (padded_seq_len, batch_size))
# Multiple of number of devices (int) closest to target number of words, assuming each sentence is of
# average length
batch_size_seq = batch_num_devices * max(1, round((batch_size / average_seq_len) / batch_num_devices))
batch_size_word = batch_size_seq * average_seq_len
else:
batch_size_seq = batch_size
batch_size_word = batch_size_seq * average_seq_len
bucket_batch_sizes.append(BucketBatchSize(bucket, batch_size_seq, batch_size_word))
# Track largest number of source or target word samples in a batch
largest_total_num_words = max(largest_total_num_words, batch_size_seq * max(*bucket))
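# Worked example for the word-based branch above (illustrative numbers): with
# batch_size=4096 target words, average_seq_len=25 and batch_num_devices=4,
#   batch_size_seq  = 4 * max(1, round((4096 / 25) / 4)) = 4 * 41 = 164 sentences
#   batch_size_word = 164 * 25 = 4100 words,
# i.e. the sentence count is rounded to the nearest multiple of the device count.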
# Final step: guarantee that the largest bucket by sequence length also has a batch size that covers any
# (batch_size, len_source) and (batch_size, len_target) matrix from the data iterator, to allow for memory
# sharing. When batching by sentences, this will already be the case.
if batch_by_words:
def score(args: argparse.Namespace):
    setup_main_logger(file_logging=False,
                      console=not args.quiet,
                      level=args.loglevel)  # pylint: disable=no-member
    utils.log_basic_info(args)

    with ExitStack() as exit_stack:
        context = utils.determine_context(device_ids=args.device_ids,
                                          use_cpu=args.use_cpu,
                                          disable_device_locking=args.disable_device_locking,
                                          lock_dir=args.lock_dir,
                                          exit_stack=exit_stack)
        if args.batch_type == C.BATCH_TYPE_SENTENCE:
            check_condition(args.batch_size % len(context) == 0,
                            "When using multiple devices the batch size must be "
                            "divisible by the number of devices. Choose a batch "
                            "size that is a multiple of %d." % len(context))
logger.info("Scoring Device(s): %s", ", ".join(str(c) for c in context))
# This call has a number of different parameters compared to training which reflect our need to get scores
# one-for-one and in the same order as the input data.
# To enable code reuse, we stuff the `args` parameter with some values.
# Bucketing and permuting need to be turned off in order to preserve the ordering of sentences.
# Finally, 'resume_training' needs to be set to True because it causes the model to be loaded instead of initialized.
args.no_bucketing = True
args.bucket_width = 10
score_iter, source_vocabs, target_vocab, model_config = get_data_iters_and_vocabs(
args=args,
model_folder=args.model)
scoring_model = scoring.ScoringModel(config=model_config,