self.frame_skip = frame_skip
n_frames = stack * (3 * use_color + 1 * (not use_color) + use_rc_frame)
self.frames = deque([], maxlen=(self.frame_skip * (self.stack - 1) + 1))
self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_height, self.screen_width, n_frames))
# coordinates
self.coords_ratio = coords_ratio
assert coords_ratio % screen_ratio == 0, (coords_ratio, screen_ratio)
self.coords_screen_ratio = coords_ratio // screen_ratio
self.coords_height = self.original_height // coords_ratio
self.coords_width = self.original_width // coords_ratio
self.coords_shape = (self.coords_height, self.coords_width)
# actions
self.action_names = ['JUMP+LEFT', 'JUMP', 'JUMP+RIGHT', 'LEFT', 'NOOP', 'RIGHT']
self.action_list = [actions[n] for n in self.action_names]
n_actions = len(self.action_list)
self.action_space = spaces.Discrete(n_actions)
self.action_repeat = action_repeat
# miscellaneous
frame_name = 'RGB' if use_color else 'G'
if use_rc_frame: frame_name += 'C'
self.name = 'CustomSuperMarioAllStars_{}_obs{}x{}x{}x{}_qframes{}x{}x{}_skip{}_repeat{}-v0'.format(
    level, *self.screen_shape, frame_name, stack, *self.coords_shape, n_actions, frame_skip, action_repeat)
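# --- Illustrative sketch (not from the original file) ---
# One way the deque above can be turned into a stacked observation: it holds the
# last frame_skip * (stack - 1) + 1 frames, and every frame_skip-th entry is
# concatenated along the channel axis to give the n_frames channels declared in
# observation_space. The helper name _get_observation and the use of numpy as np
# are assumptions, not the repo's confirmed API.
def _get_observation(self):
    assert len(self.frames) == self.frames.maxlen
    stacked = [self.frames[i] for i in range(0, len(self.frames), self.frame_skip)]
    return np.concatenate(stacked, axis=-1)  # (screen_height, screen_width, n_frames)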
def __init__(self, screen_ratio=4, coords_ratio=4, use_color=True, use_rc_frame=True, stack=3, frame_skip=4, action_repeat=4):
    utils.EzPickle.__init__(self, 'montezuma_revenge', 'image')
    self.env = gym.make('MontezumaRevengeNoFrameskip-v4').unwrapped
    self.ale = self.env.ale
    self.ale.setFloat('repeat_action_probability'.encode('utf-8'), 0)  # deterministic
    self.max_lives = self.ale.lives()
    # observations
    self.screen_ratio = screen_ratio
    self.original_height = 224
    self.original_width = 160
    self.screen_height = self.original_height // screen_ratio
    self.screen_width = self.original_width // screen_ratio
    self.screen_shape = (self.screen_height, self.screen_width)
    self.use_color = use_color
    self.use_rc_frame = use_rc_frame
    self.stack = stack
    self.frame_skip = frame_skip
    n_frames = stack * (3 * use_color + 1 * (not use_color) + use_rc_frame)
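# --- Illustrative check (not from the original file) ---
# The n_frames formula above relies on Python bools acting as 0/1: with the defaults
# (use_color=True, use_rc_frame=True, stack=3) each frame contributes 3 RGB channels
# plus 1 extra channel when use_rc_frame is set, so the stack is 12 channels deep.
for use_color, use_rc_frame, stack in [(True, True, 3), (False, False, 4)]:
    n_frames = stack * (3 * use_color + 1 * (not use_color) + use_rc_frame)
    print(use_color, use_rc_frame, stack, '->', n_frames)  # prints 12, then 4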
def evaluation(session, graph_ops, saver):
    saver.restore(session, FLAGS.checkpoint_path)
    print("Restored model weights from", FLAGS.checkpoint_path)
    monitor_env = gym.make(FLAGS.game)
    monitor_env.monitor.start(FLAGS.eval_dir + "/" + FLAGS.experiment + "/eval")
    # Unpack graph ops
    s = graph_ops["s"]
    q_values = graph_ops["q_values"]
    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=monitor_env, resized_width=FLAGS.resized_width, resized_height=FLAGS.resized_height,
                           agent_history_length=FLAGS.agent_history_length)
    for i_episode in range(FLAGS.num_eval_episodes):
        s_t = env.get_initial_state()
        ep_reward = 0
        terminal = False
        while not terminal:
            monitor_env.render()
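            # --- Illustrative continuation (not in the original snippet) ---
            # A plausible greedy rollout body: evaluate the Q-network on the current
            # state, act greedily, and accumulate the episode reward. Assumes numpy is
            # imported as np and that env.step returns (state, reward, terminal, info).
            readout_t = q_values.eval(session=session, feed_dict={s: [s_t]})
            action_index = np.argmax(readout_t)
            s_t, r_t, terminal, info = env.step(action_index)
            ep_reward += r_t
        print("Episode", i_episode, "reward:", ep_reward)
    monitor_env.monitor.close()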
def train(session, graph_ops, saver):
    # Set up game environments (one per thread)
    envs = [gym.make(GAME) for i in range(NUM_CONCURRENT)]
    summary_ops = setup_summaries()
    summary_op = summary_ops[-1]
    # Initialize variables
    session.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_SAVE_PATH, session.graph)
    # Start NUM_CONCURRENT training threads
    actor_learner_threads = [threading.Thread(target=actor_learner_thread, args=(thread_id, envs[thread_id], session, graph_ops, summary_ops, saver)) for thread_id in range(NUM_CONCURRENT)]
    for t in actor_learner_threads:
        t.start()
    # Show the agents training and write summary statistics
    last_summary_time = 0
    while True:
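        # --- Illustrative loop body (not in the original snippet) ---
        # A plausible monitoring loop: render each worker's environment and
        # periodically flush the merged summary op. SUMMARY_INTERVAL and the use of
        # time.time() are assumed module-level names, not confirmed by the snippet.
        for env in envs:
            env.render()
        now = time.time()
        if now - last_summary_time > SUMMARY_INTERVAL:
            summary_str = session.run(summary_op)
            writer.add_summary(summary_str)
            last_summary_time = now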
def __init__(self):
    low = np.array([-5, -5, -5, -5, -5])
    high = np.array([5, 5, 5, 5, 5])
    self.observation_space = gym.spaces.Box(low, high, dtype=np.float32)
    self.action_space = gym.spaces.Box(low, high, dtype=np.float32)
    self.rng = random.Random()
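# --- Illustrative sketch (not from the original source) ---
# reset/step methods that a toy 5-dimensional env like this might pair with the
# spaces above; the zero start state, the clipping, the reward, and the never-done
# rule are invented purely for illustration.
def reset(self):
    self.state = np.zeros(5, dtype=np.float32)
    return self.state

def step(self, action):
    self.state = np.clip(self.state + action, -5.0, 5.0).astype(np.float32)
    reward = -float(np.linalg.norm(self.state))
    return self.state, reward, False, {}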
def dtyped_rand():
    return np_random.rand(1, 84, 84).astype(self.dtype)
low, high = -1.0, 3.14
else:
assert False
env.reset.side_effect = [dtyped_rand() for _ in range(steps)]
env.step.side_effect = [
    (
        dtyped_rand(),
        np_random.rand(),
        bool(np_random.randint(2)),
        {},
    )
    for _ in range(steps)
]
env.action_space = gym.spaces.Discrete(2)
env.observation_space = gym.spaces.Box(
low=low, high=high, shape=(1, 84, 84), dtype=self.dtype)
return env
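# --- Illustrative usage (not from the original test) ---
# Assuming the factory above is exposed as make_mock_env() (the name is a guess),
# the mock behaves like a real gym env for exactly `steps` transitions:
mock_env = make_mock_env()
obs = mock_env.reset()
assert obs.shape == (1, 84, 84) and obs.dtype == mock_env.observation_space.dtype
obs, reward, done, info = mock_env.step(mock_env.action_space.sample())
assert isinstance(done, bool)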
@pytest.mark.parametrize("wrapper_class", [None, gym.wrappers.TimeLimit])
def test_make_vec_env(env_id, n_envs, wrapper_class, use_subprocess):
    env = make_vec_env(env_id, n_envs, use_subprocess=use_subprocess,
                       wrapper_class=wrapper_class, monitor_dir=None, seed=0)
    assert env.num_envs == n_envs
    if not use_subprocess:
        assert isinstance(env, DummyVecEnv)
        if wrapper_class is not None:
            assert isinstance(env.envs[0], wrapper_class)
        else:
            assert isinstance(env.envs[0], Monitor)
    else:
        assert isinstance(env, SubprocVecEnv)
    # Kill subprocesses
    env.close()
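# --- Illustrative usage outside pytest (not from the original test file) ---
# The same helper driven directly: four monitored CartPole workers in subprocesses,
# reset and stepped in lockstep. Arguments mirror the call inside the test above.
vec_env = make_vec_env('CartPole-v1', 4, use_subprocess=True, seed=0)
observations = vec_env.reset()  # stacked observations, one row per worker
actions = [vec_env.action_space.sample() for _ in range(vec_env.num_envs)]
observations, rewards, dones, infos = vec_env.step(actions)
vec_env.close()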
if __name__ == "__main__":
    adj_np = np.ones((5, 3, 4, 4))
    adj = tf.placeholder(shape=(5, 3, 4, 4), dtype=tf.float32)
    node_feature_np = np.ones((5, 1, 4, 3))
    node_feature = tf.placeholder(shape=(5, 1, 4, 3), dtype=tf.float32)
    ob_space = {}
    atom_type = 5
    ob_space['adj'] = gym.Space(shape=[3, 5, 5])
    ob_space['node'] = gym.Space(shape=[1, 5, atom_type])
    ac_space = gym.spaces.MultiDiscrete([10, 10, 3])
    policy = GCNPolicy(name='policy', ob_space=ob_space, ac_space=ac_space)
    stochastic = True
    env = gym.make('molecule-v0')  # in gym format
    env.init()
    ob = env.reset()
    # ob['adj'] = np.repeat(ob['adj'][None], 2, axis=0)
    # ob['node'] = np.repeat(ob['node'][None], 2, axis=0)
    print('adj', ob['adj'].shape)
    print('node', ob['node'].shape)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(20):
            ob = env.reset()
            for j in range(0, 20):
                ac, vpred, debug = policy.act(stochastic, ob)
                # if ac[0] == ac[1]:
                #     print('error')
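                # --- Illustrative continuation (not in the original snippet) ---
                # A plausible inner-loop body: apply the sampled action to the molecule
                # environment and stop the rollout once it signals termination. The
                # standard gym 4-tuple return from env.step() is an assumption here.
                ob, reward, done, info = env.step(ac)
                if done:
                    break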
# from_logits argument ensures transformation into normalized probabilities
weighted_sparse_ce = kls.SparseCategoricalCrossentropy(from_logits=True)
# policy loss is defined by policy gradients, weighted by advantages
# note: we only calculate the loss on the actions we've actually taken
actions = tf.cast(actions, tf.int32)
policy_loss = weighted_sparse_ce(actions, logits, sample_weight=advantages)
# entropy loss can be calculated via CE over itself
entropy_loss = kls.categorical_crossentropy(logits, logits, from_logits=True)
# here signs are flipped because optimizer minimizes
return policy_loss - self.params['entropy']*entropy_loss
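# --- Illustrative companion sketch (assumptions, not the original code) ---
# The policy loss above is usually paired with a value-head loss, and both are handed
# to model.compile; the weighting key 'value', the name _logits_loss for the fragment
# above, and the optimizer choice are assumptions, not confirmed by the snippet.
def _value_loss(self, returns, value):
    # regress the value head onto the empirical returns
    return self.params['value'] * kls.mean_squared_error(returns, value)

# self.model.compile(optimizer=ko.RMSprop(lr=7e-4),
#                    loss=[self._logits_loss, self._value_loss])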
if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    env = gym.make('CartPole-v0')
    model = Model(num_actions=env.action_space.n)
    agent = A2CAgent(model)
    rewards_history = agent.train(env)
    print("Finished training.")
    print("Total Episode Reward: %d out of 200" % agent.test(env, True))
    plt.style.use('seaborn')
    plt.plot(np.arange(0, len(rewards_history), 25), rewards_history[::25])
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.show()
self._max_timestep_eval = max_timestep_eval
self._gamma = gamma
self._lambda_ = lambda_
self._c1 = c1
self._c2 = c2
self._eval_every_n = eval_every_n
self._save_every_n = save_every_n
self._done_frac_for_policy_save = done_frac_for_policy_save
self._n_evals = n_evals
self._len_history_for_policy = len_history_for_policy
self._eval_temperatures = eval_temperatures
self._separate_eval = separate_eval
action_space = self.train_env.action_space
assert isinstance(
    action_space, (gym.spaces.Discrete, gym.spaces.MultiDiscrete))
if isinstance(action_space, gym.spaces.Discrete):
    n_actions = action_space.n
    n_controls = 1
else:
    (n_controls,) = action_space.nvec.shape
    assert n_controls > 0
    assert onp.min(action_space.nvec) == onp.max(action_space.nvec), (
        "Every control must have the same number of actions.")
    n_actions = action_space.nvec[0]
self._n_actions = n_actions
self._n_controls = n_controls
self._rng = trax.get_random_number_generator_and_set_seed(random_seed)
self._rng, key1 = jax_random.split(self._rng, num=2)
vocab_size = policy_and_value_vocab_size
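# --- Illustrative worked example (not from the original trainer) ---
# The branch above collapses both action-space types into (n_actions, n_controls):
# Discrete(6) gives (6, 1), MultiDiscrete([6, 6, 6]) gives (6, 3), and something like
# MultiDiscrete([6, 4]) trips the assert because its controls differ in size.
import gym
import numpy as onp

def action_space_dims(action_space):
    if isinstance(action_space, gym.spaces.Discrete):
        return action_space.n, 1
    (n_controls,) = action_space.nvec.shape
    assert onp.min(action_space.nvec) == onp.max(action_space.nvec), (
        "Every control must have the same number of actions.")
    return int(action_space.nvec[0]), n_controls

print(action_space_dims(gym.spaces.Discrete(6)))               # (6, 1)
print(action_space_dims(gym.spaces.MultiDiscrete([6, 6, 6])))  # (6, 3)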