-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
110 lines (81 loc) · 2.36 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# Author: Carolina Barbosa
import numpy as np
# Grid world: six states laid out in two columns of three rows.
# State index at each grid position:
#   2 5
#   1 4
#   0 3

# Terminal state: a trajectory stops as soon as it is reached.
terminal = 5

# Possible actions (up, down, left, right); the position of each name is
# also the row index of that action in the Q table.
actions = ['UP', 'DW', 'LF', 'RG']

# Reward collected on entering each state: -1 everywhere except state 5.
rws = np.array([-1, -1, -1, -1, -1, 10])

# Two trajectories, each given as (start state, sequence of actions).
paths = [
    (0, ['UP', 'UP', 'UP', 'RG']),
    (4, ['RG', 'RG', 'LF', 'UP']),
]

# Constants of the update rule: alpha scales each correction (step size),
# gamma discounts the best value of the next state.
alpha = 0.5
gamma = 0.8
def print_value(value):
    """Print six per-state values arranged like the grid.

    Args:
        value: Indexable holding one entry per state (indices 0-5).

    The output has three rows, top row first, each showing the left-column
    state followed by the right-column state.  The whole grid is wrapped in
    square brackets and followed by one blank line.
    """
    # (left state, right state, text before the row, text after the row)
    layout = (
        (2, 5, '[', ''),
        (1, 4, '', ''),
        (0, 3, '', ']\n'),
    )
    for left, right, opening, closing in layout:
        print(f"{opening}{value[left]!s} {value[right]!s}{closing}")
def update_value(value, state, action):
    """Apply one Q-learning update for taking ``action`` in ``state``.

    Args:
        value: Q table indexed as ``value[action_index][state]``; it is
            modified in place.
        state: Index of the current state (0-5).
        action: Name of the action; must be an entry of the module-level
            ``actions`` list, otherwise ``ValueError`` is raised.

    Returns:
        A ``(value, next_state)`` tuple: the same table object after the
        update and the state reached by the move.
    """
    row = actions.index(action)

    # For each move: the states where it would leave the grid, and the
    # index offset to the neighbouring state otherwise.
    moves = {
        'UP': ((2, 5), 1),
        'DW': ((0, 3), -1),
        'LF': ((0, 1, 2), -3),
        'RG': ((3, 4, 5), 3),
    }
    blocked_states, offset = moves.get(action, ((), 0))
    hit_wall = state in blocked_states

    # Bumping into a wall keeps the agent in place and costs -10; any other
    # move collects the reward of the state that is entered.
    next_state = state if hit_wall else state + offset
    reward = -10 if hit_wall else rws[next_state]

    # Move Q(state, action) towards reward + gamma * best Q of next state.
    current = value[row][state]
    best_next = max(value[a][next_state] for a in range(4))
    value[row][state] = current + alpha * (reward + gamma * best_next - current)

    return value, next_state
def return_policy(value):
    """Print the greedy policy derived from the Q table and return it.

    For each of the states 0-4 the action with the highest Q value is
    chosen (the first one wins on ties); state 5 is shown as ``'+10'``.
    The policy is printed in the grid layout (top row first, left column
    then right column) followed by one blank line.

    Args:
        value: Q table indexed as ``value[action_index][state]`` with four
            action rows and six states.

    Returns:
        A list of six strings: the chosen action name for states 0-4 and
        ``'+10'`` for state 5.
    """
    # A plain list is used on purpose: a NumPy array built from
    # one-character strings has a fixed width of one character and would
    # silently truncate 'UP' to 'U' and '+10' to '+'.
    policy = [' '] * 6
    policy[5] = '+10'
    for state in range(5):
        best = np.argmax([value[action][state] for action in range(4)])
        policy[state] = actions[best]

    print(policy[2] + ' ' + policy[5])
    print(policy[1] + ' ' + policy[4])
    print(policy[0] + ' ' + policy[3] + '\n')
    return policy
def main():
    """Run the Q updates along the predefined trajectories and print results.

    Starts from an all-zero Q table, replays every trajectory in ``paths``
    (stopping a trajectory early once the terminal state is reached), then
    prints the Q values of each action followed by the greedy policy.
    """
    # Initialise the Q table with zeros: one array of six state values for
    # each of the four actions.
    value = [np.zeros(6) for _ in range(4)]

    # The local is named `trajectory` so that it does not shadow the
    # module-level `actions` list that update_value relies on.
    for start_state, trajectory in paths:
        state = start_state
        for action in trajectory:
            value, state = update_value(value, state, action)
            if state == terminal:
                break

    # One grid per action, in the order UP, DW, LF, RG.
    for action_index in range(4):
        print_value(value[action_index])

    # Policy
    return_policy(value)
# Run the demonstration only when the file is executed as a script.
if __name__ == "__main__":
    main()