def __init__(self, profile=None, cluster_id=None):
    self.client = ipp.Client(profile=profile, cluster_id=cluster_id)
    self.statusDict = {}
    self.sleepSeconds = SLEEP_SECONDS
    self.keyField = 'key'
get_ipython().run_line_magic('ipcluster', '--version')
get_ipython().run_line_magic('ipcluster', 'start -n 2 --mpi')
# Repeat a few times in case of `TimeoutError`.
# After the cluster starts, the following calls do nothing
# except print "IPCluster is already running".
# This mimics what the user would do in such a case.
get_ipython().run_line_magic('ipcluster', 'start -n 2 --mpi')
get_ipython().run_line_magic('ipcluster', 'start -n 2 --mpi')
get_ipython().run_line_magic('ipcluster', 'start -n 2 --mpi')
get_ipython().run_line_magic('ipcluster', 'start -n 2 --mpi')
c = ipp.Client()
print('cluster ids:', c.ids)
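# Hedged sketch (not part of the snippet above): the same retry-on-TimeoutError
# idea done programmatically when connecting a client, instead of re-running
# the %ipcluster magic by hand. The retry count and delay are assumptions.
import time
import ipyparallel as ipp
from ipyparallel import error as iperror

def connect_with_retry(retries=5, delay=5, **client_kwargs):
    """Try to connect to the controller, retrying while it is still starting."""
    for attempt in range(retries):
        try:
            return ipp.Client(**client_kwargs)
        except (IOError, iperror.TimeoutError):
            # connection file missing or controller not answering yet
            time.sleep(delay)
    raise RuntimeError('controller did not come up after %d attempts' % retries)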
get_ipython().run_cell_magic('px', '', 'import os\nprint(os.popen("ps -u $USER | grep ip").read())')
get_ipython().run_cell_magic('px', '', 'import socket\nsocket.gethostname()')
get_ipython().run_cell_magic('px', '', 'import numpy as np\nimport tensorflow as tf\nimport horovod.tensorflow as hvd')
get_ipython().run_cell_magic('px', '', 'hvd.init()')
get_ipython().run_cell_magic('px', '', '# Note that the generated random data is different from one node to the other\nnsamples = 1000\nref_slope = 2.0\nref_offset = 0.0\nnoise = np.random.random((nsamples, 1)) - 0.5\nx_train = np.random.random((nsamples, 1)) - 0.5\ny_train = ref_slope * x_train + ref_offset + noise')
get_ipython().run_cell_magic('px', '', '#input pipeline\ndataset = tf.data.Dataset.from_tensor_slices((x_train.astype(np.float32),\n y_train.astype(np.float32)))\ndataset = dataset.shard(hvd.size(), hvd.rank())\ndataset = dataset.batch(500)\ndataset = dataset.repeat(500)\niterator = dataset.make_one_shot_iterator()\nnext_item = iterator.get_next()')
get_ipython().run_cell_magic('px', '', '# Define the model\nslope = tf.Variable(np.random.randn())\noffset = tf.Variable(np.random.randn())\n\nx, y = next_item # The model is the continuation of the pipeline\n\ny_hat = slope * x + offset\n\nloss = tf.losses.mean_squared_error(y_hat, y)\n\nopt = tf.train.GradientDescentOptimizer(.5)\ntrain = hvd.DistributedOptimizer(opt).minimize(loss)')
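# Hedged continuation sketch (not from the original notebook): the training
# loop that would run under %%px on every engine, using standard Horovod TF1
# APIs. The broadcast hook makes all workers start from rank 0's variables;
# the step count and logging cadence are assumptions.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]
config = tf.ConfigProto()
# Only relevant when GPUs are present: pin each process to one GPU.
config.gpu_options.visible_device_list = str(hvd.local_rank())
with tf.train.MonitoredTrainingSession(hooks=hooks, config=config) as sess:
    for step in range(500):
        _, loss_val = sess.run([train, loss])
        if hvd.rank() == 0 and step % 100 == 0:
            print('step', step, 'loss', loss_val)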
def _nengines_up(url_file):
    "return the number of engines up"
    client = None
    try:
        client = Client(url_file, timeout=60)
        up = len(client.ids)
        client.close()
    # the controller isn't up yet
    except iperror.TimeoutError:
        return 0
    # the JSON file is not available to parse
    except IOError:
        return 0
    else:
        return up
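# Hypothetical usage of the helper above: poll until the expected number of
# engines has registered. `url_file`, `n_expected` and the polling cadence are
# assumptions, not part of the original code.
import time

def wait_for_engines(url_file, n_expected, poll_seconds=5, max_tries=60):
    for _ in range(max_tries):
        if _nengines_up(url_file) >= n_expected:
            return
        time.sleep(poll_seconds)
    raise RuntimeError('only %d of %d engines came up'
                       % (_nengines_up(url_file), n_expected))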
of megabytes you might saturate the network interface of a single node and
potentially its memory buffers if the messages are not consumed in a streamed
manner.
Note that the AllReduce scheme implemented with the spanning-tree strategy
requires the aggregation function to be commutative and associative. This
constraint does not apply to the naive gather / reduce / broadcast strategy,
where you can reorder the partial data before performing the reduce.
"""
from __future__ import print_function
import ipyparallel as ipp
# connect client and create views
rc = ipp.Client()
rc.block = True
ids = rc.ids
root_id = ids[0]
root = rc[root_id]
view = rc[:]
# run bintree.py script defining bintree functions, etc.
exec(compile(open('bintree.py').read(), 'bintree.py', 'exec'))
# generate binary tree of parents
btree = bintree(ids)
print("setting up binary tree interconnect:")
print_bintree(btree)
while True:
    try:
        result = view.apply_sync(next, ipp.Reference('it' + name))
        # An exhausted remote iterator raises StopIteration on the engine,
        # which arrives here wrapped in a RemoteError.
    except RemoteError as e:
        if e.ename == 'StopIteration':
            # Inside a generator, returning is the correct way to end
            # iteration (PEP 479 forbids raising StopIteration here).
            return
        else:
            raise e
    else:
        yield result
# Main, interactive testing
if __name__ == '__main__':
    rc = ipp.Client()
    view = rc[:]
    print('Engine IDs:', rc.ids)

    # Make a set of 'sorted datasets'
    a0 = range(5, 20)
    a1 = range(10)
    a2 = range(15, 25)

    # Now, imagine these had been created in the remote engines by some long
    # computation. In this simple example, we just send them over into the
    # remote engines. They will all be called 'a' in each engine.
    rc[0]['a'] = a0
    rc[1]['a'] = a1
    rc[2]['a'] = a2

    # And we now make a local object which represents the remote iterator
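    # Hedged sketch of that local object (not in the original fragment): a
    # generator wrapping the try/except/yield loop shown earlier, iterating
    # over the remote 'a' of a single engine. The helper name `remote_iterator`
    # and the 'it<name>' naming convention are assumptions modeled on that loop.
    from ipyparallel.error import RemoteError

    def remote_iterator(view, name):
        """Yield the items of the remote sequence `name`, one at a time."""
        view.execute('it%s = iter(%s)' % (name, name), block=True)
        while True:
            try:
                result = view.apply_sync(next, ipp.Reference('it' + name))
            except RemoteError as e:
                if e.ename == 'StopIteration':
                    return
                raise
            else:
                yield result

    for eid in rc.ids[:3]:
        print('engine %d ->' % eid, list(remote_iterator(rc[eid], 'a')))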
def __init__(self, url_file=None, profile=None, settings=None):
    super(ParallelOptimizer, self).__init__(settings)
    self.settings = settings if settings else {}
    self._client = Client(url_file=url_file, profile=profile)
    self._dview = self._client[:]
    self._lview = self._client.load_balanced_view()
    self._code = ""
@classmethod
def create_from_legacy(cls, ipython_profile):
    """For backwards compatibility, return a parallel profile
    based on an argument that can be an LSF object, an ipython
    profile name, False, or None."""
    if isinstance(ipython_profile, LSF):
        import LSF
    elif ipython_profile:
        from ipyparallel import Client, LoadBalancedView
        client = Client(profile=ipython_profile)
        return IPython(client)
    elif ipython_profile == False:
        return Uniprocessing()
    else:
        return Multiprocessing()
if backend == 'SLURM':
    try:
        stop_server()
    except:
        logger.debug('Nothing to stop')
    slurm_script = '/mnt/home/agiovann/SOFTWARE/CaImAn/SLURM/slurmStart.sh'  # FIXME: Make this a documented environment variable
    logger.info([str(n_processes), slurm_script])
    start_server(slurm_script=slurm_script, ncpus=n_processes)
    pdir, profile = os.environ['IPPPDIR'], os.environ['IPPPROFILE']
    logger.info([pdir, profile])
    c = Client(ipython_dir=pdir, profile=profile)
    dview = c[:]
elif backend == 'ipyparallel':
    stop_server()
    start_server(ncpus=n_processes)
    c = Client()
    logger.info(f'Started ipyparallel cluster: Using {len(c)} processes')
    dview = c[:len(c)]
elif (backend == 'multiprocessing') or (backend == 'local'):
    if len(multiprocessing.active_children()) > 0:
        if ignore_preexisting:
            logger.warn('Found an existing multiprocessing pool. '
                        'This is often indicative of an already-running CaImAn cluster. '
                        'You have configured the cluster setup to not raise an exception.')
        else:
            raise Exception(
                'A cluster is already running. Terminate with dview.terminate() if you want to restart.')
    if (platform.system() == 'Darwin') and (sys.version_info > (3, 0)):
        try:
            if 'kernel' in get_ipython().trait_names():  # type: ignore
                # If you're on OSX and you're running under Jupyter or Spyder,
else:
    try:
        c.close()
    except:
        print('C was not existing, creating one')
    print("Stopping cluster to avoid unnecessary use of memory....")
    sys.stdout.flush()
    if backend == 'SLURM':
        try:
            cse.utilities.stop_server(is_slurm=True)
        except:
            print('Nothing to stop')
        slurm_script = '/mnt/xfs1/home/agiovann/SOFTWARE/Constrained_NMF/SLURM/slurmStart.sh'
        cse.utilities.start_server(slurm_script=slurm_script)
        pdir, profile = os.environ['IPPPDIR'], os.environ['IPPPROFILE']
        c = Client(ipython_dir=pdir, profile=profile)
    else:
        cse.utilities.stop_server()
        cse.utilities.start_server()
        c = Client()
    print(('Using ' + str(len(c)) + ' processes'))
    dview = c[:]
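# Hedged cleanup sketch (not part of the original demo): once processing is
# done, release the engines so a later setup does not complain that a cluster
# is already running. `purge_everything` and `stop_server` are existing
# ipyparallel / Constrained_NMF calls; invoking them at this point is an
# assumption.
dview.results.clear()
c.purge_everything()
cse.utilities.stop_server()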
#%% get all the right folders
params = [
#['Jan25_2015_07_13',30,False,False,False], # fname, frate, do_rotate_template, do_self_motion_correct, do_motion_correct
#['Jan40_exp2_001',30,False,False,False],
#['Jan42_exp4_001',30,False,False,False],
#['Jan-AMG1_exp2_new_001',30,False,False,False],
#['Jan-AMG_exp3_001',30,False,False,False],
#['Yi.data.001',30,False,True,True],
def waitForWorkers(self):
    maxTries = getParamAsInt('IPP.StartupWaitTries')
    seconds = getParamAsInt('IPP.StartupWaitSecs')
    profile = self.args.profile
    clusterId = self.args.clusterId
    client = None
    for i in range(1, maxTries + 1):
        if client and len(client) > 0:
            return
        if client is None:
            try:
                # default timeout is 10 seconds
                self.client = client = ipp.Client(profile=profile, cluster_id=clusterId)
            # except IOError:
            except Exception as e:
                _logger.debug("Error waiting for workers: %s", e)
                if i == maxTries - 1:
                    raise
                _logger.info("Waiting for client (%d/%d)", i, maxTries)
                sleep(seconds)
                continue
        if len(client.ids) == 0:
            _logger.info("Waiting for engines (%d/%d)", i, maxTries)
            sleep(seconds)