# Evaluate model accuracy over a data iterator (validation or training set).
def evaluate(model, data_iter, context):
    data_iter.reset()
    metric = mx.metric.Accuracy()
    for _, batch in enumerate(data_iter):
        data = batch.data[0].as_in_context(context)
        label = batch.label[0].as_in_context(context)
        output = model(data.astype(args.dtype, copy=False))
        metric.update([label], [output])
    return metric.get()
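# Hedged usage sketch (not part of the original snippet): call evaluate() after
# an epoch and average the accuracy across Horovod workers; `val_data`,
# `context`, and `logging` are assumed to be set up as in the fragments below.
_, val_acc = evaluate(model, val_data, context)
# Average the scalar metric across workers so every rank reports the same number.
avg_acc = hvd.allreduce(mx.nd.array([val_acc]), average=True, name='val_acc')
if hvd.rank() == 0:
    logging.info('validation accuracy = %f', avg_acc.asscalar())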
# Initialize Horovod
hvd.init()
# Horovod: pin context to local rank
context = mx.cpu(hvd.local_rank()) if args.no_cuda else mx.gpu(hvd.local_rank())
num_workers = hvd.size()
# Load training and validation data
train_data, val_data = get_mnist_iterator(hvd.rank())
# Build model
model = conv_nets()
model.cast(args.dtype)
model.hybridize()
# Create optimizer
optimizer_params = {'momentum': args.momentum,
                    'learning_rate': args.lr * hvd.size()}
opt = mx.optimizer.create('sgd', **optimizer_params)
# Initialize parameters
# Initialize Horovod
hvd.init()
# Polyaxon: create the tracking experiment only on the root rank
if hvd.rank() == 0:
    experiment = Experiment()
# Horovod: pin context to local rank
context = mx.cpu(hvd.local_rank()) if args.no_cuda else mx.gpu(hvd.local_rank())
num_workers = hvd.size()
# Load training and validation data
train_data, val_data = get_mnist_iterator(hvd.rank())
# Build model
model = conv_nets()
model.cast(args.dtype)
model.hybridize()
# Define hyper parameters
optimizer_params = {'momentum': args.momentum,
                    'learning_rate': args.lr * hvd.size(),
                    'rescale_grad': 1.0 / args.batch_size}
# Add Horovod Distributed Optimizer
opt = mx.optimizer.create('sgd', **optimizer_params)
opt = hvd.DistributedOptimizer(opt)
# Initialize parameters
initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in",
                             magnitude=2)
model.initialize(initializer, ctx=context)
# Fetch and broadcast parameters
params = model.collect_params()
if params is not None:
    hvd.broadcast_parameters(params, root_rank=0)
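# Hedged sketch (not in the original snippet): the usual next steps in the
# Horovod Gluon flow are to build a Trainer around the distributed optimizer
# and run the training loop. Assumes `gluon` and `autograd` are imported from
# mxnet and that `train_data`, `context`, and `args` come from the fragments above.
trainer = gluon.Trainer(params, opt, kvstore=None)
loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
for nbatch, batch in enumerate(train_data, start=1):
    data = batch.data[0].as_in_context(context)
    label = batch.label[0].as_in_context(context)
    with autograd.record():
        output = model(data.astype(args.dtype, copy=False))
        loss = loss_fn(output, label)
    loss.backward()
    # Gluon's Trainer.step() rescales gradients by the batch size.
    trainer.step(args.batch_size)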
# Download MNIST and build per-worker training/validation iterators.
def get_mnist_iterator(rank):
    # Assumed per-rank download directory; the original script defines
    # data_dir before this point, so it is recreated here for completeness.
    data_dir = 'data-%d' % rank
    zip_file_path = download('http://data.mxnet.io/mxnet/data/mnist.zip',
                             dirname=data_dir)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(data_dir)

    input_shape = (1, 28, 28)
    batch_size = args.batch_size

    # Shard the training set across workers via num_parts/part_index.
    train_iter = mx.io.MNISTIter(
        image="%s/train-images-idx3-ubyte" % data_dir,
        label="%s/train-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        shuffle=True,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank()
    )

    # Validation data is not sharded; every worker evaluates the full set.
    val_iter = mx.io.MNISTIter(
        image="%s/t10k-images-idx3-ubyte" % data_dir,
        label="%s/t10k-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        flat=False,
    )

    return train_iter, val_iter
# Create optimizer.
# Note that when using the Module API, we need to specify rescale_grad since
# we create the optimizer first and then wrap it with DistributedOptimizer. For
# the Gluon API, gradient rescaling is handled in Trainer.step(), so there is
# no need to specify rescale_grad (see the train_gluon() function above and the
# hedged Module.fit() sketch after this fragment).
optimizer_params = {'wd': args.wd,
                    'momentum': args.momentum,
                    'rescale_grad': 1.0 / batch_size,
                    'lr_scheduler': lr_sched}
if args.dtype == 'float16':
    optimizer_params['multi_precision'] = True
opt = mx.optimizer.create('sgd', **optimizer_params)

# Horovod: wrap optimizer with DistributedOptimizer
dist_opt = hvd.DistributedOptimizer(opt)

# Set up validation data and callbacks used during training
eval_data = None
if args.eval_epoch:
    eval_data = val_data
batch_callback = None
if args.log_interval > 0 and rank == 0:
    batch_callback = mx.callback.Speedometer(batch_size * num_workers,
                                             args.log_interval)
epoch_callback = None
if args.save_frequency > 0:
    epoch_callback = mx.callback.do_checkpoint(
        '%s-%d' % (args.model, rank),
        period=args.save_frequency)
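# Hedged sketch (not in the original snippet): with the Module API, the wrapped
# optimizer and the callbacks above are typically handed straight to
# Module.fit(). `net` (the network symbol), `initializer`, and `args.num_epochs`
# are assumed from the surrounding fragments.
mod = mx.mod.Module(symbol=net, context=context)
mod.fit(train_data,
        eval_data=eval_data,
        num_epoch=args.num_epochs,
        kvstore=None,
        optimizer=dist_opt,
        optimizer_params=None,
        initializer=initializer,
        batch_end_callback=batch_callback,
        epoch_end_callback=epoch_callback)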
def save_checkpoint(net, epoch, top1, best_acc, model_prefix, save_frequency, kvstore):
    # Only the root rank writes checkpoints when running under Horovod.
    if model_prefix is None or save_frequency == 0 or ('horovod' in kvstore and hvd.rank() != 0):
        return
    if save_frequency > 0 and (epoch + 1) % save_frequency == 0:
        fname = '{}_{:04}.params'.format(model_prefix, epoch)
        net.save_parameters(fname)
        logging.info('[Epoch {}] Saving checkpoint to {} with Accuracy: {:.4f}'.format(epoch, fname, top1))
    if top1 > best_acc:
        fname = '{}_best.params'.format(model_prefix)
        net.save_parameters(fname)
        logging.info('[Epoch {}] Saving checkpoint to {} with Accuracy: {:.4f}'.format(epoch, fname, top1))
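# Hedged usage sketch (not in the original snippet): a typical per-epoch call,
# assuming `net`, `val_data`, `context`, and `args` as in the fragments above.
best_acc = 0.0
for epoch in range(args.num_epochs):
    # ... run the training loop for this epoch ...
    _, top1 = evaluate(net, val_data, context)
    save_checkpoint(net, epoch, top1, best_acc, args.model, args.save_frequency,
                    kvstore='horovod')
    best_acc = max(best_acc, top1)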