# value_network.py (forked from DanielSlater/AlphaToe)
"""
After using reinforcement learning to train a network, e.g. policy_gradient.py, to play a game well. We then want to
learn to estimate weather that network would win, lose or draw from a given position.
Alpha Go used a database of real positions to get it's predictions from, we don't have that for tic-tac-toe so instead
we generate some random game positions and train off of the results we get playing from those.
"""
import os
import random
import numpy as np
import tensorflow as tf
from common.network_helpers import create_network, load_network, save_network, \
    get_deterministic_network_move
from games.tic_tac_toe import TicTacToeGameSpec
HIDDEN_NODES_VALUE = (100, 100, 100)
HIDDEN_NODES_REINFORCEMENT = (100, 100, 100)
BATCH_SIZE = 100  # number of positions per parameter update
LEARN_RATE = 1e-4
REINFORCEMENT_NETWORK_PATH = 'current_network.p'
VALUE_NETWORK_PATH = 'value_netowrk.p'
TRAIN_SAMPLES = 10000
TEST_SAMPLES = 10000
# to play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec
game_spec = TicTacToeGameSpec()
# number of random moves used to build a position: at least 1, at most roughly 80% of the board
NUMBER_RANDOM_RANGE = (1, int(game_spec.board_squares() * 0.8))
# it would be good to have real board positions, but failing that just generate random ones
def generate_random_board_position():
    while True:
        board_state = game_spec.new_board()
        number_moves = random.randint(*NUMBER_RANDOM_RANGE)
        side = 1
        for _ in range(number_moves):
            board_state = game_spec.apply_move(board_state,
                                               random.choice(list(game_spec.available_moves(board_state))),
                                               side)
            if game_spec.has_winner(board_state) != 0:
                # we hit an already finished game, so throw this position away and start again
                break
            side = -side
        else:
            # no winner after all the random moves, so this is a usable mid-game position
            return board_state
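
# Two networks are involved here: the reinforcement (policy) network, loaded from REINFORCEMENT_NETWORK_PATH
# (trained elsewhere, e.g. by policy_gradient.py) and only ever used to play out games, and the value network,
# which is the one trained below to predict the final result from a board position.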
reinforcement_input_layer, reinforcement_output_layer, reinforcement_variables = create_network(
    game_spec.board_squares(),
    HIDDEN_NODES_REINFORCEMENT,
    game_spec.outputs())

value_input_layer, value_output_layer, value_variables = create_network(game_spec.board_squares(), HIDDEN_NODES_VALUE,
                                                                         output_nodes=1, output_softmax=False)
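
# the value network has a single output node (no softmax) and is trained by regression against the result of
# playing the position out, minimising the summed squared error with RMSProp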
target_placeholder = tf.placeholder("float", (None, 1))
error = tf.reduce_sum(tf.square(target_placeholder - value_output_layer))
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error)

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    load_network(session, reinforcement_variables, REINFORCEMENT_NETWORK_PATH)

    if os.path.isfile(VALUE_NETWORK_PATH):
        print("loading previous version of value network")
        load_network(session, value_variables, VALUE_NETWORK_PATH)
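
    # both players in the labelling games pick their moves deterministically from the loaded policy network
    # via get_deterministic_network_move, so the recorded result reflects how that network would actually
    # finish the game from a given position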
    def make_move(board_state, side):
        move = get_deterministic_network_move(session, reinforcement_input_layer, reinforcement_output_layer,
                                              board_state, side)
        return game_spec.flat_move_to_tuple(np.argmax(move))
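
    # build a labelled data set: each key is a flattened random board position and the value is the final
    # result (as a float) of playing that position to the end with the policy network on both sides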
    board_states_training = {}
    board_states_test = []
    episode_number = 0

    while len(board_states_training) < TRAIN_SAMPLES + TEST_SAMPLES:
        board_state = generate_random_board_position()
        board_state_flat = tuple(np.ravel(board_state))

        # only accept the board_state if not already in the dict
        if board_state_flat not in board_states_training:
            result = game_spec.play_game(make_move, make_move, board_state=board_state)
            board_states_training[board_state_flat] = float(result)

    # take a random selection from training into a test set
    for _ in range(TEST_SAMPLES):
        sample = random.choice(list(board_states_training.keys()))
        board_states_test.append((sample, board_states_training[sample]))
        del board_states_training[sample]
    board_states_training = list(board_states_training.items())
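
    # measure the error on the held-out test set before any training; this is the baseline that the
    # early-stopping check below compares against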
    test_error = session.run(error, feed_dict={value_input_layer: [x[0] for x in board_states_test],
                                               target_placeholder: [[x[1]] for x in board_states_test]})
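
    # training loop: repeatedly shuffle the training positions and run mini-batch updates over them,
    # stopping as soon as the held-out test error starts to rise (simple early stopping)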
    while True:
        np.random.shuffle(board_states_training)
        train_error = 0

        for start_index in range(0, len(board_states_training) - BATCH_SIZE + 1, BATCH_SIZE):
            mini_batch = board_states_training[start_index:start_index + BATCH_SIZE]

            batch_error, _ = session.run([error, train_step],
                                         feed_dict={value_input_layer: [x[0] for x in mini_batch],
                                                    target_placeholder: [[x[1]] for x in mini_batch]})
            train_error += batch_error

        new_test_error = session.run(error, feed_dict={value_input_layer: [x[0] for x in board_states_test],
                                                       target_placeholder: [[x[1]] for x in board_states_test]})

        print("episode: %s train_error: %s test_error: %s" % (episode_number, train_error, new_test_error))

        if new_test_error > test_error:
            print("test error went up, stopping training")
            break

        test_error = new_test_error
        episode_number += 1

    save_network(session, value_variables, VALUE_NETWORK_PATH)