How to use the horovod.mxnet module in horovod

To help you get started, we've selected a few horovod.mxnet examples, based on popular ways the module is used in public projects.

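All of these examples share the same Horovod skeleton: initialize Horovod, pin the MXNet context to the local rank, scale the learning rate by the number of workers, wrap the optimizer in hvd.DistributedOptimizer, and broadcast the initial parameters from rank 0 so every worker starts identically. The sketch below condenses that skeleton (the Dense layer and learning rate are placeholders, not taken from any example); scripts written this way are launched with one process per device, e.g. horovodrun -np 4 python train.py.

import mxnet as mx
from mxnet import gluon
import horovod.mxnet as hvd

hvd.init()                           # must run before any other Horovod call
context = mx.cpu(hvd.local_rank())   # or mx.gpu(hvd.local_rank()) on GPU hosts

# Placeholder model standing in for the conv_nets() used in the examples.
model = gluon.nn.Dense(10, in_units=784)
model.initialize(mx.init.Xavier(), ctx=context)

# Scale the learning rate with the worker count, then let Horovod
# allreduce the gradients by wrapping the optimizer.
opt = mx.optimizer.create('sgd', learning_rate=0.01 * hvd.size())
opt = hvd.DistributedOptimizer(opt)

# Start every worker from rank 0's parameters.
params = model.collect_params()
hvd.broadcast_parameters(params, root_rank=0)

trainer = gluon.Trainer(params, opt, kvstore=None)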

github hpi-xnor / BMXNet-v2 / example / distributed_training-horovod / gluon_mnist.py
def evaluate(model, data_iter, context):
    # Compute accuracy of `model` over all batches in `data_iter`.
    data_iter.reset()
    metric = mx.metric.Accuracy()
    for _, batch in enumerate(data_iter):
        data = batch.data[0].as_in_context(context)
        label = batch.label[0].as_in_context(context)
        output = model(data.astype(args.dtype, copy=False))
        metric.update([label], [output])

    return metric.get()


# Initialize Horovod
hvd.init()

# Horovod: pin context to local rank
context = mx.cpu(hvd.local_rank()) if args.no_cuda else mx.gpu(hvd.local_rank())
num_workers = hvd.size()

# Load training and validation data
train_data, val_data = get_mnist_iterator(hvd.rank())

# Build model
model = conv_nets()
model.cast(args.dtype)
model.hybridize()

# Create optimizer
optimizer_params = {'momentum': args.momentum,
                    'learning_rate': args.lr * hvd.size()}
opt = mx.optimizer.create('sgd', **optimizer_params)

# Initialize parameters
initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in",
                             magnitude=2)
model.initialize(initializer, ctx=context)
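The MNIST snippets on this page stop just before the training loop. For orientation, the loop these scripts run looks roughly like the sketch below, which reuses model, opt, train_data, context, and args from the snippet above (args.epochs is an assumed flag name):

from mxnet import autograd, gluon

trainer = gluon.Trainer(model.collect_params(), opt, kvstore=None)
loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()

for epoch in range(args.epochs):
    train_data.reset()
    for _, batch in enumerate(train_data):
        data = batch.data[0].as_in_context(context)
        label = batch.label[0].as_in_context(context)
        with autograd.record():
            output = model(data.astype(args.dtype, copy=False))
            loss = loss_fn(output, label)
        loss.backward()
        # Trainer.step() rescales gradients by the batch size, which is
        # why the Gluon scripts omit rescale_grad from optimizer_params.
        trainer.step(args.batch_size)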
github horovod / horovod / examples / mxnet_mnist.py (same code as the gluon_mnist.py snippet above)
github polyaxon / polyaxon / examples / in_cluster / horovod / mxnet / mnist.py
def evaluate(model, data_iter, context):
    data_iter.reset()
    metric = mx.metric.Accuracy()
    for _, batch in enumerate(data_iter):
        data = batch.data[0].as_in_context(context)
        label = batch.label[0].as_in_context(context)
        output = model(data.astype(args.dtype, copy=False))
        metric.update([label], [output])

    return metric.get()


# Initialize Horovod
hvd.init()

# Polyaxon
if hvd.rank() == 0:
    experiment = Experiment()

# Horovod: pin context to local rank
context = mx.cpu(hvd.local_rank()) if args.no_cuda else mx.gpu(hvd.local_rank())
num_workers = hvd.size()

# Load training and validation data
train_data, val_data = get_mnist_iterator(hvd.rank())

# Build model
model = conv_nets()
model.cast(args.dtype)
model.hybridize()

# Define hyperparameters
optimizer_params = {'momentum': args.momentum,
                    'learning_rate': args.lr * hvd.size(),
                    'rescale_grad': 1.0 / args.batch_size}

# Add Horovod Distributed Optimizer
opt = mx.optimizer.create('sgd', **optimizer_params)
opt = hvd.DistributedOptimizer(opt)
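Because every worker runs the same script, experiment tracking is guarded by rank, exactly like the Experiment() created on rank 0 above. A sketch of hypothetical per-epoch reporting in the same spirit (evaluate() is the helper defined above; val_acc is an illustrative name):

import logging

# Guard on rank so validation results are reported once,
# not once per worker.
name, val_acc = evaluate(model, val_data, context)
if hvd.rank() == 0:
    logging.info('validation %s=%.4f', name, val_acc)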
github polyaxon / polyaxon-examples / in_cluster / horovod / mxnet / mnist.py
# Horovod: pin context to local rank
context = mx.cpu(hvd.local_rank()) if args.no_cuda else mx.gpu(hvd.local_rank())
num_workers = hvd.size()

# Load training and validation data
train_data, val_data = get_mnist_iterator(hvd.rank())

# Build model
model = conv_nets()
model.cast(args.dtype)
model.hybridize()

# Define hyperparameters
optimizer_params = {'momentum': args.momentum,
                    'learning_rate': args.lr * hvd.size(),
                    'rescale_grad': 1.0 / args.batch_size}

# Add Horovod Distributed Optimizer
opt = mx.optimizer.create('sgd', **optimizer_params)
opt = hvd.DistributedOptimizer(opt)

# Initialize parameters
initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in",
                             magnitude=2)
model.initialize(initializer, ctx=context)

# Fetch and broadcast parameters
params = model.collect_params()
if params is not None:
    hvd.broadcast_parameters(params, root_rank=0)
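In the Gluon scripts the broadcast is followed by building the Trainer from the wrapped optimizer; passing kvstore=None matters, because gradient averaging is done by Horovod rather than by an MXNet KVStore. A one-line sketch, assuming from mxnet import gluon:

trainer = gluon.Trainer(params, opt, kvstore=None)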
github hpi-xnor / BMXNet-v2 / example / distributed_training-horovod / gluon_mnist.py
def get_mnist_iterator(rank):
    # Each worker downloads into its own directory so runs don't collide.
    data_dir = "data-%d" % rank
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)
    zip_file_path = download('http://data.mxnet.io/mxnet/data/mnist.zip',
                             dirname=data_dir)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(data_dir)

    input_shape = (1, 28, 28)
    batch_size = args.batch_size

    train_iter = mx.io.MNISTIter(
        image="%s/train-images-idx3-ubyte" % data_dir,
        label="%s/train-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        shuffle=True,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank()
    )

    val_iter = mx.io.MNISTIter(
        image="%s/t10k-images-idx3-ubyte" % data_dir,
        label="%s/t10k-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        flat=False,
    )

    return train_iter, val_iter
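The num_parts/part_index pair is what shards the training set: each worker reads a disjoint 1/N slice, while the validation iterator is left unsharded so every worker evaluates on the full test set. The same pattern applies to any MXNet DataIter; a sketch with a hypothetical ImageRecordIter (the record file path and shape are placeholders):

train_iter = mx.io.ImageRecordIter(
    path_imgrec='train.rec',        # hypothetical record file
    data_shape=(3, 224, 224),
    batch_size=batch_size,
    shuffle=True,
    num_parts=hvd.size(),           # shard across all workers...
    part_index=hvd.rank())          # ...and pick this worker's shard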
github horovod / horovod / examples / mxnet_imagenet_resnet50.py
    # Create optimizer.
    # Note that when using the Module API we need to specify rescale_grad,
    # since we create the optimizer first and then wrap it with
    # DistributedOptimizer. For the Gluon API this is handled inside
    # Trainer.step(), so there is no need to specify rescale_grad
    # (see the train_gluon() function above).
    optimizer_params = {'wd': args.wd,
                        'momentum': args.momentum,
                        'rescale_grad': 1.0 / batch_size,
                        'lr_scheduler': lr_sched}
    if args.dtype == 'float16':
        optimizer_params['multi_precision'] = True
    opt = mx.optimizer.create('sgd', **optimizer_params)

    # Horovod: wrap optimizer with DistributedOptimizer
    dist_opt = hvd.DistributedOptimizer(opt)

    # Setup validation data and callback during training
    eval_data = None
    if args.eval_epoch:
        eval_data = val_data
    batch_callback = None
    if args.log_interval > 0 and rank == 0:
        batch_callback = mx.callback.Speedometer(batch_size * num_workers,
                                                 args.log_interval)

    epoch_callback = None
    if args.save_frequency > 0:
        epoch_callback = mx.callback.do_checkpoint(
            '%s-%d' % (args.model, rank),
            period=args.save_frequency)
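Downstream, these pieces go to the Module API's fit() call; the sketch below assumes mod is the mx.mod.Module built earlier in the script and num_epoch holds the configured epoch count (both names are assumptions here):

    # Horovod performs the gradient aggregation, so no KVStore is used.
    mod.fit(train_data,
            eval_data=eval_data,
            num_epoch=num_epoch,              # assumed config value
            kvstore=None,
            optimizer=dist_opt,
            batch_end_callback=batch_callback,
            epoch_end_callback=epoch_callback)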
github NVIDIA / DeepLearningExamples / MxNet / Classification / RN50v1.5 / fit.py
def save_checkpoint(net, epoch, top1, best_acc, model_prefix, save_frequency, kvstore):
    # Skip when checkpointing is disabled, or when running under Horovod
    # on any worker other than rank 0 (only rank 0 writes checkpoints).
    if model_prefix is None or save_frequency == 0 or ('horovod' in kvstore and hvd.rank() != 0):
        return
    if save_frequency > 0 and (epoch + 1) % save_frequency == 0:
        fname = '{}_{:04}.params'.format(model_prefix, epoch)
        net.save_parameters(fname)
        logging.info('[Epoch {}] Saving checkpoint to {} with Accuracy: {:.4f}'.format(epoch, fname, top1))
    if top1 > best_acc:
        fname = '{}_best.params'.format(model_prefix)
        net.save_parameters(fname)
        logging.info('[Epoch {}] Saving checkpoint to {} with Accuracy: {:.4f}'.format(epoch, fname, top1))
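A hypothetical call site, to show how the rank guard plays out in practice (val_acc, best_acc, and the 'resnet50' prefix are illustrative names, not taken from the script):

# Called once per epoch after validation; under a 'horovod' kvstore only
# rank 0 actually writes the files.
save_checkpoint(net, epoch, top1=val_acc, best_acc=best_acc,
                model_prefix='resnet50', save_frequency=1,
                kvstore='horovod')
best_acc = max(best_acc, val_acc)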