-
Notifications
You must be signed in to change notification settings - Fork 0
/
eval_agents.py
98 lines (88 loc) · 2.78 KB
/
eval_agents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import numpy as np
import gymnasium as gym
from stable_baselines3 import PPO
from simple_env import SimpleEnv
from grid_world import GridWorld
from train_agent_on_pfn import generate_log_dir_path
import os
def run_eval(env_name):
    """Evaluate four policies on *env_name* and print their scores.

    For each of: (1) uniform-random actions, (2) an expert PPO trained on the
    real environment, (3) the best PPO agent trained on the OSWM world model,
    and (4) the final PPO agent from that training — run 3 seeds x 100
    episodes, then print the per-seed mean episodic returns followed by
    their mean +- std.

    Parameters
    ----------
    env_name : str
        "SimpleEnv", "GridWorld", or any Gymnasium environment id.
    """
    if env_name == "SimpleEnv":
        env = SimpleEnv()
    elif env_name == "GridWorld":
        env = GridWorld()
    else:
        env = gym.make(env_name)

    def _mean_return(select_action, n_episodes=100):
        # Mean episodic return over n_episodes; select_action(obs) -> action.
        total = 0.0
        for _ in range(n_episodes):
            obs, _ = env.reset()
            while True:
                obs, r, term, trunc, _ = env.step(select_action(obs))
                total += r
                if term or trunc:
                    break
        return total / n_episodes

    def _report(label, policy_for_seed):
        # Run seeds 1..3, print the per-seed means, then mean +- std.
        print(label)
        res = []
        for s in range(1, 4):
            res.append(_mean_return(policy_for_seed(s)))
        print(res)
        res = np.array(res)
        print(res.mean(), "+-", res.std())

    def _ppo_policy(path):
        # Load a PPO checkpoint and wrap it as a deterministic obs -> action fn.
        agent = PPO.load(path)
        return lambda obs: agent.predict(obs, deterministic=True)[0]

    # Random baseline. NOTE: the seed index is intentionally unused here
    # (the original code also never seeded the sampler), so the three
    # "seeds" differ only through env/sampler stochasticity.
    _report("Random actions",
            lambda s: (lambda obs: env.action_space.sample()))

    # Expert policy trained on the real environment. The checkpoint path
    # does not depend on the seed; the same agent is re-loaded each seed.
    _report("PPO on real environment",
            lambda s: _ppo_policy(
                f"val_transitions/expert_policies/PPO_{env_name}.zip"))

    # Best agent from PPO trained inside the OSWM world model, per seed.
    _report("OSWM PPO best agent",
            lambda s: _ppo_policy(os.path.join(
                generate_log_dir_path(env_name, s, additional_path="nnenv"),
                "best_model.zip")))

    # Final (last-step) agent from the same OSWM training runs, per seed.
    _report("OSWM PPO final agent",
            lambda s: _ppo_policy(os.path.join(
                generate_log_dir_path(env_name, s), "final_model.zip")))
if __name__ == '__main__':
    # Script entry point: evaluate all policies on the Reacher task.
    run_eval("Reacher-v4")