Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_horovod_allreduce_error(self):
"""Test that the allreduce raises an error if different ranks try to
send tensors of different rank or dimension.

Note: "rank" is used in two senses here. `hvd.rank()` identifies the
worker, while the case labels below refer to tensor rank (number of
dimensions).
"""
# Initialise Horovod, then read this worker's rank and the worker count.
hvd.init()
rank = hvd.rank()
size = hvd.size()
# This test does not apply if there is only one worker.
if size == 1:
return
# Case 1: same tensor rank (three dimensions), different dimension sizes.
# The side length depends on the worker's rank, so every worker submits
# a differently shaped tensor.
dims = [17 + rank] * 3
tensor = self.random_uniform(dims, -1.0, 1.0)
# The mismatched shapes are expected to surface as FailedPreconditionError.
with self.assertRaises(tf.errors.FailedPreconditionError):
self.evaluate(hvd.allreduce(tensor))
# Case 2: same number of elements, different tensor rank.
# Worker 0 switches to a two-dimensional shape; the remainder of this case
# is not visible in this excerpt.
if rank == 0:
dims = [17, 23 * 57]
def setup_horovod_execution(self):
"""
Sets up Horovod.

Acts when `get_distributed_backend()` returns "horovod": Horovod is
imported and initialised, and a `tf.ConfigProto` is created whose
`gpu_options.visible_device_list` is set to this process's Horovod local
rank (as a string). What happens to `config` afterwards is not visible in
this excerpt.
"""
# Re-check the backend so that Horovod is only imported when it is actually selected;
# the import would crash if Horovod is not installed.
if get_distributed_backend() == "horovod":
import horovod.tensorflow as hvd
self.logger.info("Setting up Horovod execution.")
hvd.init()
config = tf.ConfigProto()
# Restrict the visible GPUs to the one whose index equals the local rank.
config.gpu_options.visible_device_list = str(hvd.local_rank())
def main(hps):
"""Run one Horovod worker using the hyper-parameters in `hps`.

The visible portion reads `hps.seed` and `hps.logdir`, and writes
`hps.train_its`, `hps.test_its` and `hps.full_test_its`. The rest of the
function is not visible in this excerpt.
"""
# Initialize Horovod.
hvd.init()
# Create tensorflow session
sess = tensorflow_session()
# Seed TensorFlow and NumPy. The seed mixes in hvd.rank(), so for a given
# hps.seed each rank gets a different seed.
tf.set_random_seed(hvd.rank() + hvd.size() * hps.seed)
np.random.seed(hvd.rank() + hvd.size() * hps.seed)
# Get the A and B train/test iterators and their data_init values, then
# store train_its, test_its and full_test_its on hps.
train_iterator_A, test_iterator_A, data_init_A, train_iterator_B, test_iterator_B, data_init_B = get_data(hps, sess)
hps.train_its, hps.test_its, hps.full_test_its = get_its(hps)
# Create log dir
logdir = os.path.abspath(hps.logdir) + "/"
# NOTE(review): every rank appears to run this check-then-create sequence; if
# several ranks share a filesystem, os.mkdir can raise FileExistsError for the
# loser of the race. Confirm, and consider os.makedirs(logdir, exist_ok=True).
if not os.path.exists(logdir):
os.mkdir(logdir)
def main():
# setup horovod
start = time()
hvd.init()
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
# get command line args
cmdline = add_cli_args()
FLAGS, unknown_args = cmdline.parse_known_args()
ds = create_data(FLAGS.data_dir, FLAGS.synthetic, FLAGS.batch_size)
model = tf.keras.applications.ResNet50(weights=None, classes=1000)
opt = tf.keras.optimizers.SGD(learning_rate=FLAGS.learning_rate * hvd.size(), momentum=0.1)
loss_func = tf.keras.losses.SparseCategoricalCrossentropy()
loop_time = time()
if hvd.local_rank() == 0:
def main(_):
"""
Builds the model and runs

The visible portion optionally initialises Horovod, sets TensorFlow
logging to INFO, loads the GPT-2 configuration and creates the data
pre-processor; model construction itself is not visible in this excerpt.

Raises:
    ValueError: if `FLAGS.config_type` is neither "json" nor "texar".
"""
# Horovod is imported here rather than at module level, so it is only
# needed when FLAGS.distributed is set.
if FLAGS.distributed:
import horovod.tensorflow as hvd
hvd.init()
tf.logging.set_verbosity(tf.logging.INFO)
# Loads GPT-2 model configuration
# "json": FLAGS.config_model is passed to the GPT-2 -> Texar config converter.
if FLAGS.config_type == "json":
gpt2_config = model_utils.transform_gpt2_to_texar_config(
FLAGS.config_model)
# "texar": FLAGS.config_model is treated as a module name and imported.
elif FLAGS.config_type == 'texar':
gpt2_config = importlib.import_module(
FLAGS.config_model)
else:
raise ValueError('Unknown config_type.')
# Creates a data pre-processor for, e.g., BPE encoding
proc = processor.get_encoder(FLAGS.pretrain_model_dir)
model=MODEL,
session_init=get_model_loader(args.load),
input_names=MODEL.get_inference_tensor_names()[0],
output_names=MODEL.get_inference_tensor_names()[1]))
if args.evaluate:
assert args.evaluate.endswith('.json'), args.evaluate
offline_evaluate(pred, args.evaluate)
elif args.predict:
COCODetection(cfg.DATA.BASEDIR, 'val2014') # Only to load the class names into caches
predict(pred, args.predict)
# train part
else:
is_horovod = cfg.TRAINER == 'horovod'
if is_horovod:
hvd.init()
logger.info("Horovod Rank={}, Size={}".format(hvd.rank(), hvd.size()))
if not is_horovod or hvd.rank() == 0:
logger.set_logger_dir(args.logdir, 'd')
finalize_configs(is_training=True)
stepnum = cfg.TRAIN.STEPS_PER_EPOCH # STEPS_PER_EPOCH = 500
# warmup is step based, lr is epoch based
init_lr = cfg.TRAIN.BASE_LR * 0.33 * min(8. / cfg.TRAIN.NUM_GPUS, 1.)
warmup_schedule = [(0, init_lr), (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR)]
warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / stepnum #1000/500
lr_schedule = [(int(warmup_end_epoch + 0.5), cfg.TRAIN.BASE_LR)]
factor = 8. / cfg.TRAIN.NUM_GPUS
for idx, steps in enumerate(cfg.TRAIN.LR_SCHEDULE[:-1]):
from __future__ import print_function
import collections
import math
import os
import random
import zipfile
import numpy as np
from six.moves import urllib
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
import horovod.tensorflow as hvd
# Horovod: initialize Horovod.
hvd.init()
# Step 1: Download the data.
url = 'http://mattmahoney.net/dc/text8.zip'
def maybe_download(filename, expected_bytes):
"""Download a file if not present, and make sure it's the right size."""
if not os.path.exists(filename):
filename, _ = urllib.request.urlretrieve(url, filename)
statinfo = os.stat(filename)
if statinfo.st_size == expected_bytes:
print('Found and verified', filename)
else:
print(statinfo.st_size)
raise Exception(
if dtype not in [tf.float32, tf.float16]:
raise ValueError("Unknown dtype received: %s (allowed: `tf.float32` and `tf.float16`)" % dtype)
if compute_format not in ["NHWC", 'NCHW']:
raise ValueError("Unknown `compute_format` received: %s (allowed: ['NHWC', 'NCHW'])" % compute_format)
if input_format not in ["NHWC", 'NCHW']:
raise ValueError("Unknown `input_format` received: %s (allowed: ['NHWC', 'NCHW'])" % input_format)
if n_channels not in [1, 3]:
raise ValueError("Unsupported number of channels: %d (allowed: 1 (grayscale) and 3 (color))" % n_channels)
if data_dir is not None and not os.path.exists(data_dir):
raise ValueError("The `data_dir` received does not exists: %s" % data_dir)
hvd.init()
tf_seed = 2 * (seed + hvd.rank()) if seed is not None else None
# ============================================
# Optimisation Flags - Do not remove
# ============================================
os.environ['CUDA_CACHE_DISABLE'] = '0'
os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
os.environ['TF_GPU_THREAD_COUNT'] = '1' if not hvd_utils.is_using_hvd() else str(hvd.size())
os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
def main():
if _DISTRIBUTED:
# Horovod: initialize Horovod.
hvd.init()
logger = _get_logger()
logger.info("Runnin Distributed")
else:
logger = _get_logger()
logger.info("Tensorflow version {}".format(tf.__version__))
if _FAKE:
train_input_fn, validation_input_fn = _create_fake_data_fn()
else:
train_input_fn, validation_input_fn = _create_data_fn(os.getenv('AZ_BATCHAI_INPUT_TRAIN'),
os.getenv('AZ_BATCHAI_INPUT_TEST'))
run_config = _get_runconfig()
model_dir = _get_model_dir()
params = {"learning_rate": _LR,
if __name__ == '__main__':
# Script entry point: parse the command line, apply config overrides, set up
# Horovod and logging, then start building the learning-rate schedule.
# Only the beginning of this block is visible in this excerpt.
parser = argparse.ArgumentParser()
parser.add_argument('--load', help='load a model for evaluation. Can overwrite BACKBONE.WEIGHTS')
parser.add_argument('--logdir', help='log directory', default='train_log/maskrcnn')
parser.add_argument('--config', help="A list of KEY=VALUE to overwrite those defined in tensorpack_config.py",
nargs='+')
args = parser.parse_args()
# Command-line KEY=VALUE pairs override the values held in cfg.
if args.config:
cfg.update_args(args.config)
MODEL = ResNetC4Model()
is_horovod = cfg.TRAINER == 'horovod'
if is_horovod:
hvd.init()
logger.info("Horovod Rank={}, Size={}".format(hvd.rank(), hvd.size()))
# Only one process sets the logger directory: rank 0 under Horovod, or the
# sole process otherwise.
if not is_horovod or hvd.rank() == 0:
logger.set_logger_dir(args.logdir, 'd')
finalize_configs(is_training=True)
stepnum = cfg.TRAIN.STEPS_PER_EPOCH # NOTE(review): this comment said 5000, but "1000/500" below implies 500 - confirm against the config
# warmup is step based, lr is epoch based
# The initial LR is a third of BASE_LR, scaled down further when more than 8 GPUs are used.
init_lr = cfg.TRAIN.BASE_LR * 0.33 * min(8. / cfg.TRAIN.NUM_GPUS, 1.)
# Two (step, lr) points: init_lr at step 0 and BASE_LR at step cfg.TRAIN.WARMUP.
warmup_schedule = [(0, init_lr), (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR)]
warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / stepnum # 1000/500
# The epoch-based schedule starts at the warmup end, rounded to the nearest whole epoch.
lr_schedule = [(int(warmup_end_epoch + 0.5), cfg.TRAIN.BASE_LR)]
factor = 8. / cfg.TRAIN.NUM_GPUS
# Every LR_SCHEDULE entry except the last gets a multiplier ten times smaller
# than the previous one (0.1, 0.01, ...); the rest of the loop body is not visible here.
for idx, steps in enumerate(cfg.TRAIN.LR_SCHEDULE[:-1]):
mult = 0.1 ** (idx + 1)