From 4a0678eba23e4aebc3872b49989c60a4a2092dee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Zaj=C4=85c?=
Date: Tue, 5 Sep 2023 20:43:11 +0200
Subject: [PATCH] Modify .rst docs for GAIL and AIRL to match tutorials

---
 docs/algorithms/airl.rst | 18 +++++++++++-------
 docs/algorithms/gail.rst | 15 ++++++++-------
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/docs/algorithms/airl.rst b/docs/algorithms/airl.rst
index 4d38dbe6e..188ca2d89 100644
--- a/docs/algorithms/airl.rst
+++ b/docs/algorithms/airl.rst
@@ -60,9 +60,13 @@ Detailed example notebook: :doc:`../tutorials/4_train_airl`
     learner = PPO(
         env=env,
         policy=MlpPolicy,
-        batch_size=16,
-        learning_rate=0.0001,
-        n_epochs=2,
+        batch_size=64,
+        ent_coef=0.0,
+        learning_rate=0.0005,
+        gamma=0.95,
+        clip_range=0.1,
+        vf_coef=0.1,
+        n_epochs=5,
         seed=SEED,
     )
     reward_net = BasicShapedRewardNet(
@@ -72,9 +76,9 @@ Detailed example notebook: :doc:`../tutorials/4_train_airl`
     )
     airl_trainer = AIRL(
         demonstrations=rollouts,
-        demo_batch_size=1024,
-        gen_replay_buffer_capacity=2048,
-        n_disc_updates_per_round=4,
+        demo_batch_size=2048,
+        gen_replay_buffer_capacity=512,
+        n_disc_updates_per_round=16,
         venv=env,
         gen_algo=learner,
         reward_net=reward_net,
@@ -84,7 +88,7 @@ Detailed example notebook: :doc:`../tutorials/4_train_airl`
     learner_rewards_before_training, _ = evaluate_policy(
         learner, env, 100, return_episode_rewards=True,
     )
-    airl_trainer.train(20000)
+    airl_trainer.train(20000)  # Train for 2_000_000 steps to match expert.
     env.seed(SEED)
     learner_rewards_after_training, _ = evaluate_policy(
         learner, env, 100, return_episode_rewards=True,
diff --git a/docs/algorithms/gail.rst b/docs/algorithms/gail.rst
index 27a708704..ea5d0895d 100644
--- a/docs/algorithms/gail.rst
+++ b/docs/algorithms/gail.rst
@@ -29,7 +29,7 @@ Detailed example notebook: :doc:`../tutorials/3_train_gail`
     from imitation.data import rollout
     from imitation.data.wrappers import RolloutInfoWrapper
     from imitation.policies.serialize import load_policy
-    from imitation.rewards.reward_nets import BasicShapedRewardNet
+    from imitation.rewards.reward_nets import BasicRewardNet
     from imitation.util.networks import RunningNorm
     from imitation.util.util import make_vec_env
 
@@ -60,11 +60,12 @@ Detailed example notebook: :doc:`../tutorials/3_train_gail`
         policy=MlpPolicy,
         batch_size=64,
         ent_coef=0.0,
-        learning_rate=0.00001,
-        n_epochs=1,
+        learning_rate=0.0004,
+        gamma=0.95,
+        n_epochs=5,
         seed=SEED,
     )
-    reward_net = BasicShapedRewardNet(
+    reward_net = BasicRewardNet(
         observation_space=env.observation_space,
         action_space=env.action_space,
         normalize_input_layer=RunningNorm,
@@ -72,8 +73,8 @@ Detailed example notebook: :doc:`../tutorials/3_train_gail`
     gail_trainer = GAIL(
         demonstrations=rollouts,
         demo_batch_size=1024,
-        gen_replay_buffer_capacity=2048,
-        n_disc_updates_per_round=4,
+        gen_replay_buffer_capacity=512,
+        n_disc_updates_per_round=8,
         venv=env,
         gen_algo=learner,
         reward_net=reward_net,
@@ -86,7 +87,7 @@ Detailed example notebook: :doc:`../tutorials/3_train_gail`
     )
 
     # train the learner and evaluate again
-    gail_trainer.train(20000)
+    gail_trainer.train(20000)  # Train for 800_000 steps to match expert.
     env.seed(SEED)
     learner_rewards_after_training, _ = evaluate_policy(
         learner, env, 100, return_episode_rewards=True,