def main():
    env = gym.make('CartPole-v0')
    model = CartpoleModel(name_scope='noIdeaWhyNeedThis', act_dim=ACT_DIM)
    alg = PolicyGradient(model, LEARNING_RATE)
    agent = CartpoleAgent(alg, OBS_DIM, ACT_DIM)
    with fluid.dygraph.guard():
        for i in range(1000):  # 1000 training episodes
            obs_list, action_list, reward_list = run_episode(env, agent)
            if i % 10 == 0:
                logger.info("Episode {}, Reward Sum {}.".format(
                    i, sum(reward_list)))

            # Turn the episode trajectory into training batches.
            batch_obs = np.array(obs_list)
            batch_action = np.array(action_list)
            batch_reward = calc_reward_to_go(reward_list)

            agent.learn(batch_obs, batch_action, batch_reward)

            # Evaluate the agent every 100 episodes.
            if (i + 1) % 100 == 0:
                _, _, reward_list = run_episode(
                    env, agent, train_or_test='test')
                total_reward = np.sum(reward_list)
                logger.info('Test reward: {}'.format(total_reward))
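
# calc_reward_to_go is called above but not shown in this snippet. A minimal
# sketch, assuming the usual policy-gradient "reward to go" with an optional
# discount factor gamma (helper name and signature taken from the call above):
def calc_reward_to_go(reward_list, gamma=1.0):
    # Accumulate rewards backwards so each step holds the sum of its future rewards.
    for i in range(len(reward_list) - 2, -1, -1):
        reward_list[i] += gamma * reward_list[i + 1]
    return np.array(reward_list)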

def _load_params(self, dirname):
    # Restore the parameters of the ensemble prediction program from disk.
    logger.info('[{}]: Loading model from {}'.format(
        self.stage_name, dirname))
    fluid.io.load_params(
        executor=self.fluid_executor,
        dirname=dirname,
        main_program=self.ensemble_predict_program)

def log_metrics(self, metrics):
    # Log every metric and push the non-empty ones to TensorBoard.
    logger.info(metrics)
    for k, v in metrics.items():
        if v is not None:
            tensorboard.add_scalar(k, v, self.sample_total_steps)

# Record the result of the finished evaluation episode.
self.evaluate_result.append({
    'env_reward': episode_env_reward,
    'episode_length': mem[-1].info['frame_count'],
    'falldown': not mem[-1].info['timeout'],
})
logger.info('{}, finish_cnt: {}'.format(
    self.cur_model, len(self.evaluate_result)))
logger.info('{}'.format(self.evaluate_result[-1]))

if len(self.evaluate_result) >= args.evaluate_times:
    # Mean of each metric over all evaluation episodes.
    mean_value = {}
    for key in self.evaluate_result[0].keys():
        mean_value[key] = np.mean(
            [x[key] for x in self.evaluate_result])
    logger.info('Model: {}, mean_value: {}'.format(
        self.cur_model, mean_value))

    # Fraction of evaluation episodes that ended in a fall.
    eval_num = len(self.evaluate_result)
    falldown_num = len(
        [x for x in self.evaluate_result if x['falldown']])
    falldown_rate = falldown_num / eval_num
    logger.info('Falldown rate: {}'.format(falldown_rate))

    # Mean of each metric over the episodes without a fall.
    for key in self.evaluate_result[0].keys():
        mean_value[key] = np.mean([
            x[key] for x in self.evaluate_result
            if not x['falldown']
        ])
    logger.info(
        'Model: {}, Exclude falldown, mean_value: {}'.format(
            self.cur_model, mean_value))

    if mean_value['shaping_reward'] > self.best_shaping_reward:

def restore(agent):
    # Reload the learner and predictor parameters from the logger directory.
    learn_dir = os.path.join(logger.get_dir(), 'learn')
    predict_dir = os.path.join(logger.get_dir(), 'predict')
    print('restore model from {}'.format(learn_dir))
    agent.load_params(learn_dir, predict_dir)

def save_rpm(self):
    # Snapshot the replay memory so training can resume after a restart.
    save_path = os.path.join(logger.get_dir(), "rpm.npz")
    self.rpm.save(save_path)
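
# A possible counterpart for reloading that snapshot, assuming the replay
# memory object also exposes a load(path) method (hypothetical here; only
# rpm.save is shown above):
def restore_rpm(self):
    load_path = os.path.join(logger.get_dir(), "rpm.npz")
    if os.path.exists(load_path):
        self.rpm.load(load_path)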

def main():
    env = gym.make("CartPole-v0")
    model = CartpoleModel(act_dim=ACT_DIM)
    alg = parl.algorithms.PolicyGradient(model, lr=LEARNING_RATE)
    agent = CartpoleAgent(alg, obs_dim=OBS_DIM, act_dim=ACT_DIM)

    # If a checkpoint already exists, restore the parameters from it.
    if os.path.exists('./model.ckpt'):
        agent.restore('./model.ckpt')

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        # Turn the episode trajectory into training batches.
        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)

        # Evaluate the agent every 100 episodes.
        if (i + 1) % 100 == 0:
            _, _, reward_list = run_episode(env, agent, train_or_test='test')
            total_reward = np.sum(reward_list)
            logger.info('Test reward: {}'.format(total_reward))

    # Save the parameters to ./model.ckpt.
    agent.save('./model.ckpt')
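
# run_episode is referenced above but not defined in this snippet. A minimal
# sketch, assuming the agent exposes sample() for exploration and predict()
# for greedy evaluation (the train_or_test flag matches the calls in main):
def run_episode(env, agent, train_or_test='train'):
    obs_list, action_list, reward_list = [], [], []
    obs = env.reset()
    while True:
        obs_list.append(obs)
        if train_or_test == 'train':
            action = agent.sample(obs)   # stochastic action for exploration
        else:
            action = agent.predict(obs)  # greedy action for evaluation
        action_list.append(action)
        obs, reward, done, _ = env.step(action)
        reward_list.append(reward)
        if done:
            break
    return obs_list, action_list, reward_list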