From 49b387fc732798e2e4ef9c2ec95a8813c2a274fa Mon Sep 17 00:00:00 2001
From: Jonathon Shen
Date: Fri, 8 Jun 2018 14:30:22 -0700
Subject: [PATCH] More PEP8 formatting

---
 rllab/tf/algos/batch_polopt.py                 |  56 +++----
 rllab/tf/algos/npo.py                          |  78 +++++-----
 rllab/tf/algos/vpg.py                          |  83 +++++-----
 rllab/tf/distributions/base.py                 |   7 +-
 rllab/tf/distributions/bernoulli.py            |  32 ++--
 rllab/tf/distributions/categorical.py          |  48 +++---
 rllab/tf/distributions/diagonal_gaussian.py    |  23 +--
 .../tf/distributions/recurrent_categorical.py  |  31 ++--
 .../recurrent_diagonal_gaussian.py             |   3 -
 rllab/tf/envs/base.py                          |   8 +-
 rllab/tf/envs/parallel_vec_env_executor.py     |  18 ++-
 rllab/tf/envs/vec_env_executor.py              |   8 +-
 .../conjugate_gradient_optimizer.py            | 142 ++++++++++++------
 .../tf/optimizers/penalty_lbfgs_optimizer.py   |  58 ++++---
 rllab/tf/policies/base.py                      |   3 -
 rllab/tf/policies/categorical_gru_policy.py    |  60 ++++----
 rllab/tf/policies/categorical_lstm_policy.py   |  81 +++++-----
 rllab/tf/policies/categorical_mlp_policy.py    |   9 +-
 rllab/tf/policies/deterministic_mlp_policy.py  |  22 ++-
 rllab/tf/policies/gaussian_gru_policy.py       |  16 +-
 rllab/tf/policies/gaussian_lstm_policy.py      |  16 +-
 rllab/tf/policies/gaussian_mlp_policy.py       |  61 +++++---
 .../tf/regressors/bernoulli_mlp_regressor.py   |  54 ++++---
 .../regressors/categorical_mlp_regressor.py    |  61 ++++----
 .../regressors/deterministic_mlp_regressor.py  |  30 ++--
 rllab/tf/regressors/gaussian_mlp_regressor.py  |  40 +++--
 26 files changed, 584 insertions(+), 464 deletions(-)

diff --git a/rllab/tf/algos/batch_polopt.py b/rllab/tf/algos/batch_polopt.py
index 933b8222c..28619365e 100644
--- a/rllab/tf/algos/batch_polopt.py
+++ b/rllab/tf/algos/batch_polopt.py
@@ -14,30 +14,28 @@ class BatchPolopt(RLAlgorithm):
     This includes various policy gradient methods like vpg, npg, ppo, trpo, etc.
""" - def __init__( - self, - env, - policy, - baseline, - scope=None, - n_itr=500, - start_itr=0, - batch_size=5000, - max_path_length=500, - discount=0.99, - gae_lambda=1, - plot=False, - pause_for_plot=False, - center_adv=True, - positive_adv=False, - store_paths=False, - whole_paths=True, - fixed_horizon=False, - sampler_cls=None, - sampler_args=None, - force_batch_sampler=False, - **kwargs - ): + def __init__(self, + env, + policy, + baseline, + scope=None, + n_itr=500, + start_itr=0, + batch_size=5000, + max_path_length=500, + discount=0.99, + gae_lambda=1, + plot=False, + pause_for_plot=False, + center_adv=True, + positive_adv=False, + store_paths=False, + whole_paths=True, + fixed_horizon=False, + sampler_cls=None, + sampler_args=None, + force_batch_sampler=False, + **kwargs): """ :param env: Environment :param policy: Policy @@ -119,7 +117,8 @@ def train(self, sess=None): logger.log("Optimizing policy...") self.optimize_policy(itr, samples_data) logger.log("Saving snapshot...") - params = self.get_itr_snapshot(itr, samples_data) # , **kwargs) + params = self.get_itr_snapshot(itr, + samples_data) # , **kwargs) if self.store_paths: params["paths"] = samples_data["paths"] logger.save_itr_params(itr, params) @@ -128,7 +127,11 @@ def train(self, sess=None): logger.record_tabular('ItrTime', time.time() - itr_start_time) logger.dump_tabular(with_prefix=False) if self.plot: - rollout(self.env, self.policy, animated=True, max_path_length=self.max_path_length) + rollout( + self.env, + self.policy, + animated=True, + max_path_length=self.max_path_length) if self.pause_for_plot: input("Plotting evaluation run: Press Enter to " "continue...") @@ -157,4 +160,3 @@ def get_itr_snapshot(self, itr, samples_data): def optimize_policy(self, itr, samples_data): raise NotImplementedError - diff --git a/rllab/tf/algos/npo.py b/rllab/tf/algos/npo.py index 9aed1b31e..ab9c16da8 100644 --- a/rllab/tf/algos/npo.py +++ b/rllab/tf/algos/npo.py @@ -1,6 +1,3 @@ - - - from rllab.misc import ext from rllab.misc.overrides import overrides import rllab.misc.logger as logger @@ -16,13 +13,12 @@ class NPO(BatchPolopt): Natural Policy Optimization. 
""" - def __init__( - self, - optimizer=None, - optimizer_args=None, - step_size=0.01, - name="NPO", - **kwargs): + def __init__(self, + optimizer=None, + optimizer_args=None, + step_size=0.01, + name="NPO", + **kwargs): if optimizer is None: if optimizer_args is None: optimizer_args = dict() @@ -52,37 +48,52 @@ def init_opt(self): dist = self.policy.distribution old_dist_info_vars = { - k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name='old_%s' % k) + k: tf.placeholder( + tf.float32, + shape=[None] * (1 + is_recurrent) + list(shape), + name='old_%s' % k) for k, shape in dist.dist_info_specs - } - old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys] + } + old_dist_info_vars_list = [ + old_dist_info_vars[k] for k in dist.dist_info_keys + ] state_info_vars = { - k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=k) + k: tf.placeholder( + tf.float32, + shape=[None] * (1 + is_recurrent) + list(shape), + name=k) for k, shape in self.policy.state_info_specs - } - state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys] + } + state_info_vars_list = [ + state_info_vars[k] for k in self.policy.state_info_keys + ] if is_recurrent: - valid_var = tf.placeholder(tf.float32, shape=[None, None], name="valid") + valid_var = tf.placeholder( + tf.float32, shape=[None, None], name="valid") else: valid_var = None - dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars) + dist_info_vars = self.policy.dist_info_sym(obs_var, + state_info_vars) kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) - lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars) + lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, + dist_info_vars) if is_recurrent: - mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var) - surr_loss = - tf.reduce_sum(lr * advantage_var * valid_var) / tf.reduce_sum(valid_var) + mean_kl = tf.reduce_sum( + kl * valid_var) / tf.reduce_sum(valid_var) + surr_loss = -tf.reduce_sum( + lr * advantage_var * valid_var) / tf.reduce_sum(valid_var) else: mean_kl = tf.reduce_mean(kl, name="reduce_mean_er") - surr_loss = - tf.reduce_mean(lr * advantage_var) + surr_loss = -tf.reduce_mean(lr * advantage_var) input_list = [ - obs_var, - action_var, - advantage_var, - ] + state_info_vars_list + old_dist_info_vars_list + obs_var, + action_var, + advantage_var, + ] + state_info_vars_list + old_dist_info_vars_list if is_recurrent: input_list.append(valid_var) @@ -91,22 +102,21 @@ def init_opt(self): target=self.policy, leq_constraint=(mean_kl, self.step_size), inputs=input_list, - constraint_name="mean_kl" - ) + constraint_name="mean_kl") return dict() @overrides def optimize_policy(self, itr, samples_data): - all_input_values = tuple(ext.extract( - samples_data, - "observations", "actions", "advantages" - )) + all_input_values = tuple( + ext.extract(samples_data, "observations", "actions", "advantages")) agent_infos = samples_data["agent_infos"] state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] - dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys] + dist_info_list = [ + agent_infos[k] for k in self.policy.distribution.dist_info_keys + ] all_input_values += tuple(state_info_list) + tuple(dist_info_list) if self.policy.recurrent: - all_input_values += (samples_data["valids"],) + all_input_values += (samples_data["valids"], ) logger.log("Computing loss before") loss_before = 
self.optimizer.loss(all_input_values) logger.log("Computing KL before") diff --git a/rllab/tf/algos/vpg.py b/rllab/tf/algos/vpg.py index bbcdfc8d9..5d94ad623 100644 --- a/rllab/tf/algos/vpg.py +++ b/rllab/tf/algos/vpg.py @@ -1,5 +1,3 @@ - - from rllab.misc import logger from rllab.misc import ext from rllab.misc.overrides import overrides @@ -16,15 +14,14 @@ class VPG(BatchPolopt, Serializable): Vanilla Policy Gradient. """ - def __init__( - self, - env, - policy, - baseline, - optimizer=None, - optimizer_args=None, - name="VPG", - **kwargs): + def __init__(self, + env, + policy, + baseline, + optimizer=None, + optimizer_args=None, + name="VPG", + **kwargs): Serializable.quick_init(self, locals()) if optimizer is None: default_args = dict( @@ -39,7 +36,8 @@ def __init__( self.optimizer = optimizer self.opt_info = None self.name = name - super(VPG, self).__init__(env=env, policy=policy, baseline=baseline, **kwargs) + super(VPG, self).__init__( + env=env, policy=policy, baseline=baseline, **kwargs) @overrides def init_opt(self): @@ -62,71 +60,86 @@ def init_opt(self): dist = self.policy.distribution old_dist_info_vars = { - k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name='old_%s' % k) + k: tf.placeholder( + tf.float32, + shape=[None] * (1 + is_recurrent) + list(shape), + name='old_%s' % k) for k, shape in dist.dist_info_specs - } - old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys] + } + old_dist_info_vars_list = [ + old_dist_info_vars[k] for k in dist.dist_info_keys + ] state_info_vars = { - k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=k) + k: tf.placeholder( + tf.float32, + shape=[None] * (1 + is_recurrent) + list(shape), + name=k) for k, shape in self.policy.state_info_specs - } - state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys] + } + state_info_vars_list = [ + state_info_vars[k] for k in self.policy.state_info_keys + ] if is_recurrent: - valid_var = tf.placeholder(tf.float32, shape=[None, None], name="valid") + valid_var = tf.placeholder( + tf.float32, shape=[None, None], name="valid") else: valid_var = None - dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars) + dist_info_vars = self.policy.dist_info_sym(obs_var, + state_info_vars) logli = dist.log_likelihood_sym(action_var, dist_info_vars) kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) # formulate as a minimization problem # The gradient of the surrogate objective is the policy gradient if is_recurrent: - surr_obj = - tf.reduce_sum(logli * advantage_var * valid_var) / tf.reduce_sum(valid_var) - mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var) + surr_obj = -tf.reduce_sum(logli * advantage_var * + valid_var) / tf.reduce_sum(valid_var) + mean_kl = tf.reduce_sum( + kl * valid_var) / tf.reduce_sum(valid_var) max_kl = tf.reduce_max(kl * valid_var) else: - surr_obj = - tf.reduce_mean(logli * advantage_var) + surr_obj = -tf.reduce_mean(logli * advantage_var) mean_kl = tf.reduce_mean(kl) max_kl = tf.reduce_max(kl) - input_list = [obs_var, action_var, advantage_var] + state_info_vars_list + input_list = [obs_var, action_var, advantage_var + ] + state_info_vars_list if is_recurrent: input_list.append(valid_var) - self.optimizer.update_opt(loss=surr_obj, target=self.policy, inputs=input_list) + self.optimizer.update_opt( + loss=surr_obj, target=self.policy, inputs=input_list) f_kl = tensor_utils.compile_function( inputs=input_list + old_dist_info_vars_list, outputs=[mean_kl, 
max_kl], ) - self.opt_info = dict( - f_kl=f_kl, - ) + self.opt_info = dict(f_kl=f_kl, ) @overrides def optimize_policy(self, itr, samples_data): logger.log("optimizing policy") - inputs = ext.extract( - samples_data, - "observations", "actions", "advantages" - ) + inputs = ext.extract(samples_data, "observations", "actions", + "advantages") agent_infos = samples_data["agent_infos"] state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] inputs += tuple(state_info_list) if self.policy.recurrent: - inputs += (samples_data["valids"],) - dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys] + inputs += (samples_data["valids"], ) + dist_info_list = [ + agent_infos[k] for k in self.policy.distribution.dist_info_keys + ] loss_before = self.optimizer.loss(inputs) self.optimizer.optimize(inputs) loss_after = self.optimizer.loss(inputs) logger.record_tabular("LossBefore", loss_before) logger.record_tabular("LossAfter", loss_after) - mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list)) + mean_kl, max_kl = self.opt_info['f_kl']( + *(list(inputs) + dist_info_list)) logger.record_tabular('MeanKL', mean_kl) logger.record_tabular('MaxKL', max_kl) diff --git a/rllab/tf/distributions/base.py b/rllab/tf/distributions/base.py index ef225a18a..30cdddf6d 100644 --- a/rllab/tf/distributions/base.py +++ b/rllab/tf/distributions/base.py @@ -1,7 +1,3 @@ - - - - class Distribution(object): @property def dim(self): @@ -19,7 +15,8 @@ def kl(self, old_dist_info, new_dist_info): """ raise NotImplementedError - def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): + def likelihood_ratio_sym(self, x_var, old_dist_info_vars, + new_dist_info_vars): raise NotImplementedError def entropy(self, dist_info): diff --git a/rllab/tf/distributions/bernoulli.py b/rllab/tf/distributions/bernoulli.py index 3ce0c6919..18fc63ff4 100644 --- a/rllab/tf/distributions/bernoulli.py +++ b/rllab/tf/distributions/bernoulli.py @@ -1,5 +1,3 @@ - - from .base import Distribution import tensorflow as tf import numpy as np @@ -35,29 +33,43 @@ def kl(self, old_dist_info, new_dist_info): def sample(self, dist_info): p = np.asarray(dist_info["p"]) - return np.cast['int'](np.random.uniform(low=0., high=1., size=p.shape) < p) + return np.cast['int']( + np.random.uniform(low=0., high=1., size=p.shape) < p) - def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars, name="likelihood_ratio_sym"): + def likelihood_ratio_sym(self, + x_var, + old_dist_info_vars, + new_dist_info_vars, + name="likelihood_ratio_sym"): with enclosing_scope(self._name, name): old_p = old_dist_info_vars["p"] new_p = new_dist_info_vars["p"] ndims = old_p.get_shape().ndims - return tf.reduce_prod(x_var * new_p / (old_p + TINY) + (1 - x_var) * (1 - new_p) / (1 - old_p + TINY), - axis=ndims - 1) + return tf.reduce_prod( + x_var * new_p / (old_p + TINY) + + (1 - x_var) * (1 - new_p) / (1 - old_p + TINY), + axis=ndims - 1) - def log_likelihood_sym(self, x_var, dist_info_vars, name="log_likelihood_sym"): + def log_likelihood_sym(self, + x_var, + dist_info_vars, + name="log_likelihood_sym"): with enclosing_scope(self._name, name): p = dist_info_vars["p"] ndims = p.get_shape().ndims - return tf.reduce_sum(x_var * tf.log(p + TINY) + (1 - x_var) * tf.log(1 - p + TINY), axis=ndims - 1) + return tf.reduce_sum( + x_var * tf.log(p + TINY) + (1 - x_var) * tf.log(1 - p + TINY), + axis=ndims - 1) def log_likelihood(self, xs, dist_info): p = dist_info["p"] - return np.sum(xs * np.log(p + TINY) 
+ (1 - xs) * np.log(1 - p + TINY), axis=-1) + return np.sum( + xs * np.log(p + TINY) + (1 - xs) * np.log(1 - p + TINY), axis=-1) def entropy(self, dist_info): p = dist_info["p"] - return np.sum(- p * np.log(p + TINY) - (1 - p) * np.log(1 - p + TINY), axis=-1) + return np.sum( + -p * np.log(p + TINY) - (1 - p) * np.log(1 - p + TINY), axis=-1) @property def dist_info_keys(self): diff --git a/rllab/tf/distributions/categorical.py b/rllab/tf/distributions/categorical.py index fdebe0e8d..7818557f3 100644 --- a/rllab/tf/distributions/categorical.py +++ b/rllab/tf/distributions/categorical.py @@ -7,7 +7,7 @@ def from_onehot(x_var): - ret = np.zeros((len(x_var),), 'int32') + ret = np.zeros((len(x_var), ), 'int32') nonzero_n, nonzero_a = np.nonzero(x_var) ret[nonzero_n] = nonzero_a return ret @@ -18,13 +18,11 @@ def __init__(self, dim, name="Categorical"): self._dim = dim self._name = name weights_var = tf.placeholder( - dtype=tf.float32, - shape=(None, dim), - name="weights" - ) + dtype=tf.float32, shape=(None, dim), name="weights") self._f_sample = tensor_utils.compile_function( inputs=[weights_var], - outputs=tf.multinomial(tf.log(weights_var + 1e-8), num_samples=1)[:, 0], + outputs=tf.multinomial(tf.log(weights_var + 1e-8), + num_samples=1)[:, 0], ) @property @@ -41,9 +39,9 @@ def kl_sym(self, old_dist_info_vars, new_dist_info_vars, name="kl_sym"): ndims = old_prob_var.get_shape().ndims # Assume layout is N * A return tf.reduce_sum( - old_prob_var * (tf.log(old_prob_var + TINY) - tf.log(new_prob_var + TINY)), - axis=ndims - 1 - ) + old_prob_var * + (tf.log(old_prob_var + TINY) - tf.log(new_prob_var + TINY)), + axis=ndims - 1) def kl(self, old_dist_info, new_dist_info): """ @@ -53,10 +51,13 @@ def kl(self, old_dist_info, new_dist_info): new_prob = new_dist_info["prob"] return np.sum( old_prob * (np.log(old_prob + TINY) - np.log(new_prob + TINY)), - axis=-1 - ) + axis=-1) - def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars, name="likelihood_ratio_sym"): + def likelihood_ratio_sym(self, + x_var, + old_dist_info_vars, + new_dist_info_vars, + name="likelihood_ratio_sym"): with enclosing_scope(self._name, name): old_prob_var = old_dist_info_vars["prob"] new_prob_var = new_dist_info_vars["prob"] @@ -71,26 +72,32 @@ def entropy_sym(self, dist_info_vars, name="entropy_sym"): probs = dist_info_vars["prob"] return -tf.reduce_sum(probs * tf.log(probs + TINY), axis=1) - def cross_entropy_sym(self, old_dist_info_vars, new_dist_info_vars, name="cross_entropy_sym"): + def cross_entropy_sym(self, + old_dist_info_vars, + new_dist_info_vars, + name="cross_entropy_sym"): with enclosing_scope(self._name, name): old_prob_var = old_dist_info_vars["prob"] new_prob_var = new_dist_info_vars["prob"] ndims = old_prob_var.get_shape().ndims # Assume layout is N * A return tf.reduce_sum( - old_prob_var * (- tf.log(new_prob_var + TINY)), - axis=ndims - 1 - ) + old_prob_var * (-tf.log(new_prob_var + TINY)), axis=ndims - 1) def entropy(self, info): probs = info["prob"] return -np.sum(probs * np.log(probs + TINY), axis=1) - def log_likelihood_sym(self, x_var, dist_info_vars, name="log_likelihood_sym"): + def log_likelihood_sym(self, + x_var, + dist_info_vars, + name="log_likelihood_sym"): with enclosing_scope(self._name, name): probs = dist_info_vars["prob"] ndims = probs.get_shape().ndims - return tf.log(tf.reduce_sum(probs * tf.cast(x_var, tf.float32), ndims - 1) + TINY) + return tf.log( + tf.reduce_sum(probs * tf.cast(x_var, tf.float32), ndims - 1) + + TINY) def log_likelihood(self, xs, dist_info): 
probs = dist_info["prob"] @@ -99,7 +106,7 @@ def log_likelihood(self, xs, dist_info): @property def dist_info_specs(self): - return [("prob", (self.dim,))] + return [("prob", (self.dim, ))] def sample(self, dist_info): return self._f_sample(dist_info["prob"]) @@ -108,4 +115,5 @@ def sample_sym(self, dist_info, name="sample_sym"): with enclosing_scope(self._name, name): probs = dist_info["prob"] samples = tf.multinomial(tf.log(probs + 1e-8), num_samples=1)[:, 0] - return tf.nn.embedding_lookup(np.eye(self.dim, dtype=np.float32), samples) + return tf.nn.embedding_lookup( + np.eye(self.dim, dtype=np.float32), samples) diff --git a/rllab/tf/distributions/diagonal_gaussian.py b/rllab/tf/distributions/diagonal_gaussian.py index ca95b1f65..7e5e0b7d0 100644 --- a/rllab/tf/distributions/diagonal_gaussian.py +++ b/rllab/tf/distributions/diagonal_gaussian.py @@ -1,6 +1,3 @@ - - - import tensorflow as tf import numpy as np from rllab.tf.distributions import Distribution @@ -41,7 +38,7 @@ def kl(self, old_dist_info, new_dist_info): # return TT.sum( # numerator / denominator + TT.log(new_std) - TT.log(old_std ), axis=-1) - def kl_sym(self, old_dist_info_vars, new_dist_info_vars, name = "kl_sym"): + def kl_sym(self, old_dist_info_vars, new_dist_info_vars, name="kl_sym"): with enclosing_scope(self._name, name): old_means = old_dist_info_vars["mean"] old_log_stds = old_dist_info_vars["log_std"] @@ -64,13 +61,20 @@ def kl_sym(self, old_dist_info_vars, new_dist_info_vars, name = "kl_sym"): return tf.reduce_sum( numerator / denominator + new_log_stds - old_log_stds, axis=-1) - def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars, name = "likelihood_ratio_sym"): + def likelihood_ratio_sym(self, + x_var, + old_dist_info_vars, + new_dist_info_vars, + name="likelihood_ratio_sym"): with enclosing_scope(self._name, name): logli_new = self.log_likelihood_sym(x_var, new_dist_info_vars) logli_old = self.log_likelihood_sym(x_var, old_dist_info_vars) return tf.exp(logli_new - logli_old) - def log_likelihood_sym(self, x_var, dist_info_vars, name = "log_likelihood_sym"): + def log_likelihood_sym(self, + x_var, + dist_info_vars, + name="log_likelihood_sym"): with enclosing_scope(self._name, name): means = dist_info_vars["mean"] log_stds = dist_info_vars["log_std"] @@ -97,11 +101,12 @@ def entropy(self, dist_info): log_stds = dist_info["log_std"] return np.sum(log_stds + np.log(np.sqrt(2 * np.pi * np.e)), axis=-1) - def entropy_sym(self, dist_info_var, name = "entropy_sym"): + def entropy_sym(self, dist_info_var, name="entropy_sym"): with enclosing_scope(self._name, name): log_std_var = dist_info_var["log_std"] - return tf.reduce_sum(log_std_var + np.log(np.sqrt(2 * np.pi * np.e)), axis=-1) + return tf.reduce_sum( + log_std_var + np.log(np.sqrt(2 * np.pi * np.e)), axis=-1) @property def dist_info_specs(self): - return [("mean", (self.dim,)), ("log_std", (self.dim,))] + return [("mean", (self.dim, )), ("log_std", (self.dim, ))] diff --git a/rllab/tf/distributions/recurrent_categorical.py b/rllab/tf/distributions/recurrent_categorical.py index 8e1e6d8aa..2da9a819c 100644 --- a/rllab/tf/distributions/recurrent_categorical.py +++ b/rllab/tf/distributions/recurrent_categorical.py @@ -26,9 +26,9 @@ def kl_sym(self, old_dist_info_vars, new_dist_info_vars, name="kl_sym"): new_prob_var = new_dist_info_vars["prob"] # Assume layout is N * T * A return tf.reduce_sum( - old_prob_var * (tf.log(old_prob_var + TINY) - tf.log(new_prob_var + TINY)), - axis=2 - ) + old_prob_var * + (tf.log(old_prob_var + TINY) - 
tf.log(new_prob_var + TINY)), + axis=2) def kl(self, old_dist_info, new_dist_info): """ @@ -38,10 +38,13 @@ def kl(self, old_dist_info, new_dist_info): new_prob = new_dist_info["prob"] return np.sum( old_prob * (np.log(old_prob + TINY) - np.log(new_prob + TINY)), - axis=2 - ) + axis=2) - def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars, name="likelihood_ratio_sym"): + def likelihood_ratio_sym(self, + x_var, + old_dist_info_vars, + new_dist_info_vars, + name="likelihood_ratio_sym"): with enclosing_scope(self._name, name): old_prob_var = old_dist_info_vars["prob"] new_prob_var = new_dist_info_vars["prob"] @@ -50,8 +53,7 @@ def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars, na flat_ratios = self._cat.likelihood_ratio_sym( tf.reshape(x_var, tf.stack([-1, a_dim])), dict(prob=tf.reshape(old_prob_var, tf.stack([-1, a_dim]))), - dict(prob=tf.reshape(new_prob_var, tf.stack([-1, a_dim]))) - ) + dict(prob=tf.reshape(new_prob_var, tf.stack([-1, a_dim])))) return tf.reshape(flat_ratios, tf.shape(old_prob_var)[:2]) def entropy(self, dist_info): @@ -63,7 +65,8 @@ def entropy_sym(self, dist_info_vars, name="entropy_sym"): probs = dist_info_vars["prob"] return -tf.reduce_sum(probs * tf.log(probs + TINY), 2) - def log_likelihood_sym(self, xs, dist_info_vars, name="log_likelihood_sym"): + def log_likelihood_sym(self, xs, dist_info_vars, + name="log_likelihood_sym"): with enclosing_scope(self._name, name): probs = dist_info_vars["prob"] # Assume layout is N * T * A @@ -71,8 +74,7 @@ def log_likelihood_sym(self, xs, dist_info_vars, name="log_likelihood_sym"): # a_dim = TT.printing.Print("lala")(a_dim) flat_logli = self._cat.log_likelihood_sym( tf.reshape(xs, tf.stack([-1, a_dim])), - dict(prob=tf.reshape(probs, tf.stack((-1, a_dim)))) - ) + dict(prob=tf.reshape(probs, tf.stack((-1, a_dim))))) return tf.reshape(flat_logli, tf.shape(probs)[:2]) def log_likelihood(self, xs, dist_info): @@ -80,12 +82,9 @@ def log_likelihood(self, xs, dist_info): # Assume layout is N * T * A a_dim = tf.shape(probs)[2] flat_logli = self._cat.log_likelihood_sym( - xs.reshape((-1, a_dim)), - dict(prob=probs.reshape((-1, a_dim))) - ) + xs.reshape((-1, a_dim)), dict(prob=probs.reshape((-1, a_dim)))) return flat_logli.reshape(probs.shape[:2]) @property def dist_info_specs(self): - return [("prob", (self.dim,))] - + return [("prob", (self.dim, ))] diff --git a/rllab/tf/distributions/recurrent_diagonal_gaussian.py b/rllab/tf/distributions/recurrent_diagonal_gaussian.py index 2cfa2db62..5e21bdefd 100644 --- a/rllab/tf/distributions/recurrent_diagonal_gaussian.py +++ b/rllab/tf/distributions/recurrent_diagonal_gaussian.py @@ -1,6 +1,3 @@ - - - from rllab.tf.distributions import DiagonalGaussian RecurrentDiagonalGaussian = DiagonalGaussian diff --git a/rllab/tf/envs/base.py b/rllab/tf/envs/base.py index 45e402303..5dff510c5 100644 --- a/rllab/tf/envs/base.py +++ b/rllab/tf/envs/base.py @@ -27,7 +27,8 @@ def __init__(self, cls, env_cls, extra_kwargs): self.extra_kwargs = extra_kwargs def __call__(self, *args, **kwargs): - return self.cls(self.env_cls(*args, **dict(self.extra_kwargs, **kwargs))) + return self.cls( + self.env_cls(*args, **dict(self.extra_kwargs, **kwargs))) class TfEnv(ProxyEnv): @@ -51,7 +52,9 @@ def vectorized(self): return getattr(self.wrapped_env, "vectorized", False) def vec_env_executor(self, n_envs, max_path_length): - return VecTfEnv(self.wrapped_env.vec_env_executor(n_envs=n_envs, max_path_length=max_path_length)) + return VecTfEnv( + 
self.wrapped_env.vec_env_executor( + n_envs=n_envs, max_path_length=max_path_length)) @classmethod def wrap(cls, env_cls, **extra_kwargs): @@ -60,7 +63,6 @@ def wrap(cls, env_cls, **extra_kwargs): class VecTfEnv(object): - def __init__(self, vec_env): self.vec_env = vec_env diff --git a/rllab/tf/envs/parallel_vec_env_executor.py b/rllab/tf/envs/parallel_vec_env_executor.py index 1826fa88d..adae52ddc 100644 --- a/rllab/tf/envs/parallel_vec_env_executor.py +++ b/rllab/tf/envs/parallel_vec_env_executor.py @@ -1,5 +1,3 @@ - - import numpy as np import pickle as pickle from rllab.tf.misc import tensor_utils @@ -14,12 +12,14 @@ def worker_init_envs(G, alloc, scope, env): if not hasattr(G, 'parallel_vec_envs'): G.parallel_vec_envs = dict() G.parallel_vec_env_template = dict() - G.parallel_vec_envs[scope] = [(idx, pickle.loads(pickle.dumps(env))) for idx in alloc] + G.parallel_vec_envs[scope] = [(idx, pickle.loads(pickle.dumps(env))) + for idx in alloc] G.parallel_vec_env_template[scope] = env # For these two methods below, we pack the data into batch numpy arrays whenever possible, to reduce communication cost + def worker_run_reset(G, flags, scope): if not hasattr(G, 'parallel_vec_envs'): logger.log("on worker %d" % G.worker_id) @@ -91,7 +91,8 @@ def __init__(self, env, n, max_path_length, scope=None): start_id += n_allocs rest_alloc = max(0, rest_alloc - envs_per_worker) - singleton_pool.run_each(worker_init_envs, [(alloc, scope, env) for alloc in alloc_env_ids]) + singleton_pool.run_each( + worker_init_envs, [(alloc, scope, env) for alloc in alloc_env_ids]) self._alloc_env_ids = alloc_env_ids self._action_space = env.action_space @@ -112,7 +113,8 @@ def step(self, action_n): obs = self.observation_space.unflatten_n(np.concatenate(obs)) rewards = np.concatenate(rewards) dones = np.concatenate(dones) - env_infos = tensor_utils.split_tensor_dict_list(tensor_utils.concat_tensor_dict_list(env_infos)) + env_infos = tensor_utils.split_tensor_dict_list( + tensor_utils.concat_tensor_dict_list(env_infos)) if env_infos is None: env_infos = [dict() for _ in range(self.num_envs)] @@ -133,7 +135,8 @@ def step(self, action_n): if done: obs[i] = reset_obs[i] self.ts[i] = 0 - return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(list(env_infos)) + return obs, rewards, dones, tensor_utils.stack_tensor_dict_list( + list(env_infos)) def _run_reset(self, dones): dones = np.asarray(dones) @@ -143,7 +146,8 @@ def _run_reset(self, dones): ) ids, flat_obs = list(map(np.concatenate, list(zip(*results)))) zipped = list(zip(ids, flat_obs)) - sorted_obs = np.asarray([x[1] for x in sorted(zipped, key=lambda x: x[0])]) + sorted_obs = np.asarray( + [x[1] for x in sorted(zipped, key=lambda x: x[0])]) done_ids, = np.where(dones) done_flat_obs = sorted_obs[done_ids] diff --git a/rllab/tf/envs/vec_env_executor.py b/rllab/tf/envs/vec_env_executor.py index 1f462b03d..11e777cd3 100644 --- a/rllab/tf/envs/vec_env_executor.py +++ b/rllab/tf/envs/vec_env_executor.py @@ -1,5 +1,3 @@ - - import numpy as np import pickle as pickle from rllab.tf.misc import tensor_utils @@ -15,7 +13,8 @@ def __init__(self, envs, max_path_length): def step(self, action_n): all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)] - obs, rewards, dones, env_infos = list(map(list, list(zip(*all_results)))) + obs, rewards, dones, env_infos = list( + map(list, list(zip(*all_results)))) dones = np.asarray(dones) rewards = np.asarray(rewards) self.ts += 1 @@ -25,7 +24,8 @@ def step(self, action_n): if done: obs[i] = self.envs[i].reset() 
self.ts[i] = 0 - return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(env_infos) + return obs, rewards, dones, tensor_utils.stack_tensor_dict_list( + env_infos) def reset(self): results = [env.reset() for env in self.envs] diff --git a/rllab/tf/optimizers/conjugate_gradient_optimizer.py b/rllab/tf/optimizers/conjugate_gradient_optimizer.py index 9535489b4..eb7592f16 100644 --- a/rllab/tf/optimizers/conjugate_gradient_optimizer.py +++ b/rllab/tf/optimizers/conjugate_gradient_optimizer.py @@ -25,27 +25,34 @@ def update_opt(self, f, target, inputs, reg_coeff, name="update_opt"): self.reg_coeff = reg_coeff params = target.get_params(trainable=True) - constraint_grads = tf.gradients(f, xs=params, name="constraint_gradients") + constraint_grads = tf.gradients( + f, xs=params, name="constraint_gradients") for idx, (grad, param) in enumerate(zip(constraint_grads, params)): if grad is None: constraint_grads[idx] = tf.zeros_like(param) - xs = tuple([tensor_utils.new_tensor_like(p.name.split(":")[0], p) for p in params]) + xs = tuple([ + tensor_utils.new_tensor_like(p.name.split(":")[0], p) + for p in params + ]) def Hx_plain(original_scope_name=tf.get_variable_scope().name): with tf.variable_scope(original_scope_name): with enclosing_scope(name, "Hx_plain"): Hx_plain_splits = tf.gradients( tf.reduce_sum( - tf.stack([tf.reduce_sum(g * x) for g, x in zip(constraint_grads, xs)]) - ), + tf.stack([ + tf.reduce_sum(g * x) + for g, x in zip(constraint_grads, xs) + ])), params, - name="Hx_plain_gradients" - ) - for idx, (Hx, param) in enumerate(zip(Hx_plain_splits, params)): + name="Hx_plain_gradients") + for idx, (Hx, param) in enumerate( + zip(Hx_plain_splits, params)): if Hx is None: Hx_plain_splits[idx] = tf.zeros_like(param) - return tensor_utils.flatten_tensor_variables(Hx_plain_splits) + return tensor_utils.flatten_tensor_variables( + Hx_plain_splits) self.opt_fun = ext.lazydict( f_Hx_plain=lambda: tensor_utils.compile_function( @@ -58,14 +65,20 @@ def Hx_plain(original_scope_name=tf.get_variable_scope().name): def build_eval(self, inputs): def eval(x): xs = tuple(self.target.flat_to_params(x, trainable=True)) - ret = sliced_fun(self.opt_fun["f_Hx_plain"], self._num_slices)(inputs, xs) + self.reg_coeff * x + ret = sliced_fun(self.opt_fun["f_Hx_plain"], self._num_slices)( + inputs, xs) + self.reg_coeff * x return ret return eval class FiniteDifferenceHvp(object): - def __init__(self, base_eps=1e-8, symmetric=True, grad_clip=None, num_slices=1, name="FiniteDifferenceHvp"): + def __init__(self, + base_eps=1e-8, + symmetric=True, + grad_clip=None, + num_slices=1, + name="FiniteDifferenceHvp"): self.base_eps = base_eps self.symmetric = symmetric self.grad_clip = grad_clip @@ -79,7 +92,8 @@ def update_opt(self, f, target, inputs, reg_coeff, name="update_opt"): params = target.get_params(trainable=True) - constraint_grads = tf.gradients(f, xs=params, name="constraint_gradients") + constraint_grads = tf.gradients( + f, xs=params, name="constraint_gradients") for idx, (grad, param) in enumerate(zip(constraint_grads, params)): if grad is None: constraint_grads[idx] = tf.zeros_like(param) @@ -89,14 +103,17 @@ def update_opt(self, f, target, inputs, reg_coeff, name="update_opt"): def f_Hx_plain(*args): inputs_ = args[:len(inputs)] xs = args[len(inputs):] - flat_xs = np.concatenate([np.reshape(x, (-1,)) for x in xs]) + flat_xs = np.concatenate([np.reshape(x, (-1, )) for x in xs]) param_val = self.target.get_param_values(trainable=True) - eps = np.cast['float32'](self.base_eps / 
(np.linalg.norm(param_val) + 1e-8)) - self.target.set_param_values(param_val + eps * flat_xs, trainable=True) + eps = np.cast['float32']( + self.base_eps / (np.linalg.norm(param_val) + 1e-8)) + self.target.set_param_values( + param_val + eps * flat_xs, trainable=True) flat_grad_dvplus = self.opt_fun["f_grad"](*inputs_) self.target.set_param_values(param_val, trainable=True) if self.symmetric: - self.target.set_param_values(param_val - eps * flat_xs, trainable=True) + self.target.set_param_values( + param_val - eps * flat_xs, trainable=True) flat_grad_dvminus = self.opt_fun["f_grad"](*inputs_) hx = (flat_grad_dvplus - flat_grad_dvminus) / (2 * eps) self.target.set_param_values(param_val, trainable=True) @@ -117,7 +134,8 @@ def f_Hx_plain(*args): def build_eval(self, inputs): def eval(x): xs = tuple(self.target.flat_to_params(x, trainable=True)) - ret = sliced_fun(self.opt_fun["f_Hx_plain"], self._num_slices)(inputs,xs) + self.reg_coeff * x + ret = sliced_fun(self.opt_fun["f_Hx_plain"], self._num_slices)( + inputs, xs) + self.reg_coeff * x return ret return eval @@ -130,18 +148,17 @@ class ConjugateGradientOptimizer(Serializable): of the loss function. """ - def __init__( - self, - cg_iters=10, - reg_coeff=1e-5, - subsample_factor=1., - backtrack_ratio=0.8, - max_backtracks=15, - debug_nan=False, - accept_violation=False, - hvp_approach=None, - num_slices=1, - name="ConjugateGradientOptimizer"): + def __init__(self, + cg_iters=10, + reg_coeff=1e-5, + subsample_factor=1., + backtrack_ratio=0.8, + max_backtracks=15, + debug_nan=False, + accept_violation=False, + hvp_approach=None, + num_slices=1, + name="ConjugateGradientOptimizer"): """ :param cg_iters: The number of CG iterations used to calculate A^-1 g @@ -173,8 +190,16 @@ def __init__( hvp_approach = PerlmutterHvp(num_slices) self._hvp_approach = hvp_approach - def update_opt(self, loss, target, leq_constraint, inputs, extra_inputs=None, name="update_opt", - constraint_name="constraint", *args, **kwargs): + def update_opt(self, + loss, + target, + leq_constraint, + inputs, + extra_inputs=None, + name="update_opt", + constraint_name="constraint", + *args, + **kwargs): """ :param loss: Symbolic expression for the loss function. :param target: A parameterized object to optimize over. 
It should implement methods of the @@ -201,8 +226,11 @@ def update_opt(self, loss, target, leq_constraint, inputs, extra_inputs=None, na grads[idx] = tf.zeros_like(param) flat_grad = tensor_utils.flatten_tensor_variables(grads) - self._hvp_approach.update_opt(f=constraint_term, target=target, inputs=inputs + extra_inputs, - reg_coeff=self._reg_coeff) + self._hvp_approach.update_opt( + f=constraint_term, + target=target, + inputs=inputs + extra_inputs, + reg_coeff=self._reg_coeff) self._target = target self._max_constraint_val = constraint_value @@ -235,15 +263,20 @@ def loss(self, inputs, extra_inputs=None): inputs = tuple(inputs) if extra_inputs is None: extra_inputs = tuple() - return sliced_fun(self._opt_fun["f_loss"], self._num_slices)(inputs, extra_inputs) + return sliced_fun(self._opt_fun["f_loss"], + self._num_slices)(inputs, extra_inputs) def constraint_val(self, inputs, extra_inputs=None): inputs = tuple(inputs) if extra_inputs is None: extra_inputs = tuple() - return sliced_fun(self._opt_fun["f_constraint"], self._num_slices)(inputs, extra_inputs) + return sliced_fun(self._opt_fun["f_constraint"], + self._num_slices)(inputs, extra_inputs) - def optimize(self, inputs, extra_inputs=None, subsample_grouped_inputs=None): + def optimize(self, + inputs, + extra_inputs=None, + subsample_grouped_inputs=None): prev_param = np.copy(self._target.get_param_values(trainable=True)) inputs = tuple(inputs) if extra_inputs is None: @@ -256,19 +289,25 @@ def optimize(self, inputs, extra_inputs=None, subsample_grouped_inputs=None): for inputs_grouped in subsample_grouped_inputs: n_samples = len(inputs_grouped[0]) inds = np.random.choice( - n_samples, int(n_samples * self._subsample_factor), replace=False) + n_samples, + int(n_samples * self._subsample_factor), + replace=False) subsample_inputs += tuple([x[inds] for x in inputs_grouped]) else: subsample_inputs = inputs - logger.log("Start CG optimization: #parameters: %d, #inputs: %d, #subsample_inputs: %d"%(len(prev_param),len(inputs[0]), len(subsample_inputs[0]))) + logger.log( + "Start CG optimization: #parameters: %d, #inputs: %d, #subsample_inputs: %d" + % (len(prev_param), len(inputs[0]), len(subsample_inputs[0]))) logger.log("computing loss before") - loss_before = sliced_fun(self._opt_fun["f_loss"], self._num_slices)(inputs, extra_inputs) + loss_before = sliced_fun(self._opt_fun["f_loss"], + self._num_slices)(inputs, extra_inputs) logger.log("performing update") logger.log("computing gradient") - flat_g = sliced_fun(self._opt_fun["f_grad"], self._num_slices)(inputs, extra_inputs) + flat_g = sliced_fun(self._opt_fun["f_grad"], + self._num_slices)(inputs, extra_inputs) logger.log("gradient computed") logger.log("computing descent direction") @@ -277,8 +316,8 @@ def optimize(self, inputs, extra_inputs=None, subsample_grouped_inputs=None): descent_direction = krylov.cg(Hx, flat_g, cg_iters=self._cg_iters) initial_step_size = np.sqrt( - 2.0 * self._max_constraint_val * (1. / (descent_direction.dot(Hx(descent_direction)) + 1e-8)) - ) + 2.0 * self._max_constraint_val * + (1. / (descent_direction.dot(Hx(descent_direction)) + 1e-8))) if np.isnan(initial_step_size): initial_step_size = 1. 
flat_descent_step = initial_step_size * descent_direction @@ -286,28 +325,33 @@ def optimize(self, inputs, extra_inputs=None, subsample_grouped_inputs=None): logger.log("descent direction computed") n_iter = 0 - for n_iter, ratio in enumerate(self._backtrack_ratio ** np.arange(self._max_backtracks)): + for n_iter, ratio in enumerate(self._backtrack_ratio + **np.arange(self._max_backtracks)): cur_step = ratio * flat_descent_step cur_param = prev_param - cur_step self._target.set_param_values(cur_param, trainable=True) - loss, constraint_val = sliced_fun(self._opt_fun["f_loss_constraint"], self._num_slices)(inputs, - extra_inputs) + loss, constraint_val = sliced_fun( + self._opt_fun["f_loss_constraint"], + self._num_slices)(inputs, extra_inputs) if self._debug_nan and np.isnan(constraint_val): - import ipdb; + import ipdb ipdb.set_trace() if loss < loss_before and constraint_val <= self._max_constraint_val: break - if (np.isnan(loss) or np.isnan(constraint_val) or loss >= loss_before or constraint_val >= - self._max_constraint_val) and not self._accept_violation: + if (np.isnan(loss) or np.isnan(constraint_val) or loss >= loss_before + or constraint_val >= self._max_constraint_val + ) and not self._accept_violation: logger.log("Line search condition violated. Rejecting the step!") if np.isnan(loss): logger.log("Violated because loss is NaN") if np.isnan(constraint_val): - logger.log("Violated because constraint %s is NaN" % self._constraint_name) + logger.log("Violated because constraint %s is NaN" % + self._constraint_name) if loss >= loss_before: logger.log("Violated because loss not improving") if constraint_val >= self._max_constraint_val: - logger.log("Violated because constraint %s is violated" % self._constraint_name) + logger.log("Violated because constraint %s is violated" % + self._constraint_name) self._target.set_param_values(prev_param, trainable=True) logger.log("backtrack iters: %d" % n_iter) logger.log("computing loss after") diff --git a/rllab/tf/optimizers/penalty_lbfgs_optimizer.py b/rllab/tf/optimizers/penalty_lbfgs_optimizer.py index f455d9f22..c5e439f3e 100644 --- a/rllab/tf/optimizers/penalty_lbfgs_optimizer.py +++ b/rllab/tf/optimizers/penalty_lbfgs_optimizer.py @@ -14,17 +14,16 @@ class PenaltyLbfgsOptimizer(Serializable): the constraint is satisfied. """ - def __init__( - self, - max_opt_itr=20, - initial_penalty=1.0, - min_penalty=1e-2, - max_penalty=1e6, - increase_penalty_factor=2, - decrease_penalty_factor=0.5, - max_penalty_itr=10, - adapt_penalty=True, - name="PenaltyLbfgsOptimizer"): + def __init__(self, + max_opt_itr=20, + initial_penalty=1.0, + min_penalty=1e-2, + max_penalty=1e6, + increase_penalty_factor=2, + decrease_penalty_factor=0.5, + max_penalty_itr=10, + adapt_penalty=True, + name="PenaltyLbfgsOptimizer"): Serializable.quick_init(self, locals()) self._name = name self._max_opt_itr = max_opt_itr @@ -42,7 +41,15 @@ def __init__( self._max_constraint_val = None self._constraint_name = None - def update_opt(self, loss, target, leq_constraint, inputs, constraint_name="constraint", name="PenaltyLbfgsOptimizer", *args, **kwargs): + def update_opt(self, + loss, + target, + leq_constraint, + inputs, + constraint_name="constraint", + name="PenaltyLbfgsOptimizer", + *args, + **kwargs): """ :param loss: Symbolic expression for the loss function. :param target: A parameterized object to optimize over. 
It should implement methods of the @@ -96,8 +103,8 @@ def optimize(self, inputs): inputs = tuple(inputs) - try_penalty = np.clip( - self._penalty, self._min_penalty, self._max_penalty) + try_penalty = np.clip(self._penalty, self._min_penalty, + self._max_penalty) penalty_scale_factor = None f_opt = self._opt_fun["f_opt"] @@ -106,25 +113,28 @@ def optimize(self, inputs): def gen_f_opt(penalty): def f(flat_params): self._target.set_param_values(flat_params, trainable=True) - return f_opt(*(inputs + (penalty,))) + return f_opt(*(inputs + (penalty, ))) return f - cur_params = self._target.get_param_values(trainable=True).astype('float64') + cur_params = self._target.get_param_values( + trainable=True).astype('float64') opt_params = cur_params for penalty_itr in range(self._max_penalty_itr): logger.log('trying penalty=%.3f...' % try_penalty) itr_opt_params, _, _ = scipy.optimize.fmin_l_bfgs_b( - func=gen_f_opt(try_penalty), x0=cur_params, - maxiter=self._max_opt_itr - ) + func=gen_f_opt(try_penalty), + x0=cur_params, + maxiter=self._max_opt_itr) - _, try_loss, try_constraint_val = f_penalized_loss(*(inputs + (try_penalty,))) + _, try_loss, try_constraint_val = f_penalized_loss( + *(inputs + (try_penalty, ))) logger.log('penalty %f => loss %f, %s %f' % - (try_penalty, try_loss, self._constraint_name, try_constraint_val)) + (try_penalty, try_loss, self._constraint_name, + try_constraint_val)) # Either constraint satisfied, or we are at the last iteration already and no alternative parameter # satisfies the constraint @@ -138,7 +148,8 @@ def f(flat_params): # Decide scale factor on the first iteration, or if constraint violation yields numerical error if penalty_scale_factor is None or np.isnan(try_constraint_val): # Increase penalty if constraint violated, or if constraint term is NAN - if try_constraint_val > self._max_constraint_val or np.isnan(try_constraint_val): + if try_constraint_val > self._max_constraint_val or np.isnan( + try_constraint_val): penalty_scale_factor = self._increase_penalty_factor else: # Otherwise (i.e. 
constraint satisfied), shrink penalty @@ -152,7 +163,8 @@ def f(flat_params): try_constraint_val >= self._max_constraint_val: break try_penalty *= penalty_scale_factor - try_penalty = np.clip(try_penalty, self._min_penalty, self._max_penalty) + try_penalty = np.clip(try_penalty, self._min_penalty, + self._max_penalty) self._penalty = try_penalty self._target.set_param_values(opt_params, trainable=True) diff --git a/rllab/tf/policies/base.py b/rllab/tf/policies/base.py index 804d55374..a3eb9944f 100644 --- a/rllab/tf/policies/base.py +++ b/rllab/tf/policies/base.py @@ -1,6 +1,3 @@ - - - from rllab.tf.core import Parameterized diff --git a/rllab/tf/policies/categorical_gru_policy.py b/rllab/tf/policies/categorical_gru_policy.py index 464b118f0..c5280d7d1 100644 --- a/rllab/tf/policies/categorical_gru_policy.py +++ b/rllab/tf/policies/categorical_gru_policy.py @@ -44,10 +44,7 @@ def __init__( else: input_dim = obs_dim - l_input = L.InputLayer( - shape=(None, None, input_dim), - name="input" - ) + l_input = L.InputLayer(shape=(None, None, input_dim), name="input") if feature_network is None: feature_dim = input_dim @@ -68,26 +65,28 @@ def __init__( ) prob_network = GRUNetwork( - input_shape=(feature_dim,), + input_shape=(feature_dim, ), input_layer=l_feature, output_dim=env_spec.action_space.n, hidden_dim=hidden_dim, hidden_nonlinearity=hidden_nonlinearity, output_nonlinearity=tf.nn.softmax, gru_layer_cls=gru_layer_cls, - name="prob_network" - ) + name="prob_network") self.prob_network = prob_network self.feature_network = feature_network self.l_input = l_input self.state_include_action = state_include_action - flat_input_var = tf.placeholder(dtype=tf.float32, shape=(None, input_dim), name="flat_input") + flat_input_var = tf.placeholder( + dtype=tf.float32, shape=(None, input_dim), name="flat_input") if feature_network is None: feature_var = flat_input_var else: - feature_var = L.get_output(l_flat_feature, {feature_network.input_layer: flat_input_var}) + feature_var = L.get_output( + l_flat_feature, + {feature_network.input_layer: flat_input_var}) self.f_step_prob = tensor_utils.compile_function( [ @@ -97,8 +96,7 @@ def __init__( L.get_output([ prob_network.step_output_layer, prob_network.step_hidden_layer - ], {prob_network.step_input_layer: feature_var}) - ) + ], {prob_network.step_input_layer: feature_var})) self.input_dim = input_dim self.action_dim = action_dim @@ -123,25 +121,25 @@ def dist_info_sym(self, obs_var, state_info_vars, name="dist_info_sym"): obs_var = tf.reshape(obs_var, tf.stack([n_batches, n_steps, -1])) obs_var = tf.cast(obs_var, tf.float32) if self.state_include_action: - prev_action_var = tf.cast(state_info_vars["prev_action"], tf.float32) - all_input_var = tf.concat(axis=2, values=[obs_var, prev_action_var]) + prev_action_var = tf.cast(state_info_vars["prev_action"], + tf.float32) + all_input_var = tf.concat( + axis=2, values=[obs_var, prev_action_var]) else: all_input_var = obs_var if self.feature_network is None: return dict( - prob=L.get_output( - self.prob_network.output_layer, - {self.l_input: all_input_var} - ) - ) + prob=L.get_output(self.prob_network.output_layer, + {self.l_input: all_input_var})) else: - flat_input_var = tf.reshape(all_input_var, (-1, self.input_dim)) + flat_input_var = tf.reshape(all_input_var, + (-1, self.input_dim)) return dict( prob=L.get_output( - self.prob_network.output_layer, - {self.l_input: all_input_var, self.feature_network.input_layer: flat_input_var} - ) - ) + self.prob_network.output_layer, { + self.l_input: all_input_var, + 
self.feature_network.input_layer: flat_input_var + })) @property def vectorized(self): @@ -152,11 +150,13 @@ def reset(self, dones=None): dones = [True] dones = np.asarray(dones) if self.prev_actions is None or len(dones) != len(self.prev_actions): - self.prev_actions = np.zeros((len(dones), self.action_space.flat_dim)) + self.prev_actions = np.zeros((len(dones), + self.action_space.flat_dim)) self.prev_hiddens = np.zeros((len(dones), self.hidden_dim)) self.prev_actions[dones] = 0. - self.prev_hiddens[dones] = self.prob_network.hid_init_param.eval() # get_value() + self.prev_hiddens[ + dones] = self.prob_network.hid_init_param.eval() # get_value() # The return value is a pair. The first item is a matrix (N, A), where each # entry corresponds to the action value taken. The second item is a vector @@ -172,14 +172,12 @@ def get_actions(self, observations): flat_obs = self.observation_space.flatten_n(observations) if self.state_include_action: assert self.prev_actions is not None - all_input = np.concatenate([ - flat_obs, - self.prev_actions - ], axis=-1) + all_input = np.concatenate([flat_obs, self.prev_actions], axis=-1) else: all_input = flat_obs probs, hidden_vec = self.f_step_prob(all_input, self.prev_hiddens) - actions = special.weighted_sample_n(probs, np.arange(self.action_space.n)) + actions = special.weighted_sample_n(probs, + np.arange(self.action_space.n)) prev_actions = self.prev_actions self.prev_actions = self.action_space.flatten_n(actions) self.prev_hiddens = hidden_vec @@ -201,7 +199,7 @@ def distribution(self): def state_info_specs(self): if self.state_include_action: return [ - ("prev_action", (self.action_dim,)), + ("prev_action", (self.action_dim, )), ] else: return [] diff --git a/rllab/tf/policies/categorical_lstm_policy.py b/rllab/tf/policies/categorical_lstm_policy.py index 480f34ee7..27c54a1ce 100644 --- a/rllab/tf/policies/categorical_lstm_policy.py +++ b/rllab/tf/policies/categorical_lstm_policy.py @@ -15,19 +15,17 @@ class CategoricalLSTMPolicy(StochasticPolicy, LayersPowered, Serializable): - def __init__( - self, - env_spec, - name="CategoricalLSTMPolicy", - hidden_dim=32, - feature_network=None, - prob_network=None, - state_include_action=True, - hidden_nonlinearity=tf.tanh, - forget_bias=1.0, - use_peepholes=False, - lstm_layer_cls=L.LSTMLayer - ): + def __init__(self, + env_spec, + name="CategoricalLSTMPolicy", + hidden_dim=32, + feature_network=None, + prob_network=None, + state_include_action=True, + hidden_nonlinearity=tf.tanh, + forget_bias=1.0, + use_peepholes=False, + lstm_layer_cls=L.LSTMLayer): """ :param env_spec: A spec for the env. 
:param hidden_dim: dimension of hidden layer @@ -47,10 +45,7 @@ def __init__( else: input_dim = obs_dim - l_input = L.InputLayer( - shape=(None, None, input_dim), - name="input" - ) + l_input = L.InputLayer(shape=(None, None, input_dim), name="input") if feature_network is None: feature_dim = input_dim @@ -72,7 +67,7 @@ def __init__( if prob_network is None: prob_network = LSTMNetwork( - input_shape=(feature_dim,), + input_shape=(feature_dim, ), input_layer=l_feature, output_dim=env_spec.action_space.n, hidden_dim=hidden_dim, @@ -81,19 +76,21 @@ def __init__( forget_bias=forget_bias, use_peepholes=use_peepholes, lstm_layer_cls=lstm_layer_cls, - name="prob_network" - ) + name="prob_network") self.prob_network = prob_network self.feature_network = feature_network self.l_input = l_input self.state_include_action = state_include_action - flat_input_var = tf.placeholder(dtype=tf.float32, shape=(None, input_dim), name="flat_input") + flat_input_var = tf.placeholder( + dtype=tf.float32, shape=(None, input_dim), name="flat_input") if feature_network is None: feature_var = flat_input_var else: - feature_var = L.get_output(l_flat_feature, {feature_network.input_layer: flat_input_var}) + feature_var = L.get_output( + l_flat_feature, + {feature_network.input_layer: flat_input_var}) self.f_step_prob = tensor_utils.compile_function( [ @@ -105,8 +102,7 @@ def __init__( prob_network.step_output_layer, prob_network.step_hidden_layer, prob_network.step_cell_layer - ], {prob_network.step_input_layer: feature_var}) - ) + ], {prob_network.step_input_layer: feature_var})) self.input_dim = input_dim self.action_dim = action_dim @@ -134,24 +130,23 @@ def dist_info_sym(self, obs_var, state_info_vars, name="dist_info_sym"): if self.state_include_action: prev_action_var = state_info_vars["prev_action"] prev_action_var = tf.cast(prev_action_var, tf.float32) - all_input_var = tf.concat(axis=2, values=[obs_var, prev_action_var]) + all_input_var = tf.concat( + axis=2, values=[obs_var, prev_action_var]) else: all_input_var = obs_var if self.feature_network is None: return dict( - prob=L.get_output( - self.prob_network.output_layer, - {self.l_input: all_input_var} - ) - ) + prob=L.get_output(self.prob_network.output_layer, + {self.l_input: all_input_var})) else: - flat_input_var = tf.reshape(all_input_var, (-1, self.input_dim)) + flat_input_var = tf.reshape(all_input_var, + (-1, self.input_dim)) return dict( prob=L.get_output( - self.prob_network.output_layer, - {self.l_input: all_input_var, self.feature_network.input_layer: flat_input_var} - ) - ) + self.prob_network.output_layer, { + self.l_input: all_input_var, + self.feature_network.input_layer: flat_input_var + })) @property def vectorized(self): @@ -162,7 +157,8 @@ def reset(self, dones=None): dones = [True] dones = np.asarray(dones) if self.prev_actions is None or len(dones) != len(self.prev_actions): - self.prev_actions = np.zeros((len(dones), self.action_space.flat_dim)) + self.prev_actions = np.zeros((len(dones), + self.action_space.flat_dim)) self.prev_hiddens = np.zeros((len(dones), self.hidden_dim)) self.prev_cells = np.zeros((len(dones), self.hidden_dim)) @@ -184,14 +180,13 @@ def get_actions(self, observations): flat_obs = self.observation_space.flatten_n(observations) if self.state_include_action: assert self.prev_actions is not None - all_input = np.concatenate([ - flat_obs, - self.prev_actions - ], axis=-1) + all_input = np.concatenate([flat_obs, self.prev_actions], axis=-1) else: all_input = flat_obs - probs, hidden_vec, cell_vec = 
self.f_step_prob(all_input, self.prev_hiddens, self.prev_cells) - actions = special.weighted_sample_n(probs, np.arange(self.action_space.n)) + probs, hidden_vec, cell_vec = self.f_step_prob( + all_input, self.prev_hiddens, self.prev_cells) + actions = special.weighted_sample_n(probs, + np.arange(self.action_space.n)) prev_actions = self.prev_actions self.prev_actions = self.action_space.flatten_n(actions) self.prev_hiddens = hidden_vec @@ -214,7 +209,7 @@ def distribution(self): def state_info_specs(self): if self.state_include_action: return [ - ("prev_action", (self.action_dim,)), + ("prev_action", (self.action_dim, )), ] else: return [] diff --git a/rllab/tf/policies/categorical_mlp_policy.py b/rllab/tf/policies/categorical_mlp_policy.py index 40312bee4..8ca72de12 100644 --- a/rllab/tf/policies/categorical_mlp_policy.py +++ b/rllab/tf/policies/categorical_mlp_policy.py @@ -35,7 +35,7 @@ def __init__( with tf.variable_scope(name): if prob_network is None: prob_network = MLP( - input_shape=(env_spec.observation_space.flat_dim,), + input_shape=(env_spec.observation_space.flat_dim, ), output_dim=env_spec.action_space.n, hidden_sizes=hidden_sizes, hidden_nonlinearity=hidden_nonlinearity, @@ -47,8 +47,7 @@ def __init__( self._l_obs = prob_network.input_layer self._f_prob = tensor_utils.compile_function( [prob_network.input_layer.input_var], - L.get_output(prob_network.output_layer) - ) + L.get_output(prob_network.output_layer)) self._dist = Categorical(env_spec.action_space.n) @@ -61,7 +60,9 @@ def vectorized(self): @overrides def dist_info_sym(self, obs_var, state_info_vars=None): - return dict(prob=L.get_output(self._l_prob, {self._l_obs: tf.cast(obs_var, tf.float32)})) + return dict( + prob=L.get_output(self._l_prob, + {self._l_obs: tf.cast(obs_var, tf.float32)})) @overrides def dist_info(self, obs, state_infos=None): diff --git a/rllab/tf/policies/deterministic_mlp_policy.py b/rllab/tf/policies/deterministic_mlp_policy.py index 3f674c447..e007aa570 100644 --- a/rllab/tf/policies/deterministic_mlp_policy.py +++ b/rllab/tf/policies/deterministic_mlp_policy.py @@ -15,21 +15,20 @@ class DeterministicMLPPolicy(Policy, LayersPowered, Serializable): - def __init__( - self, - env_spec, - name="DeterministicMLPPolicy", - hidden_sizes=(32, 32), - hidden_nonlinearity=tf.nn.relu, - output_nonlinearity=tf.nn.tanh, - prob_network=None, - bn=False): + def __init__(self, + env_spec, + name="DeterministicMLPPolicy", + hidden_sizes=(32, 32), + hidden_nonlinearity=tf.nn.relu, + output_nonlinearity=tf.nn.tanh, + prob_network=None, + bn=False): Serializable.quick_init(self, locals()) with tf.variable_scope(name): if prob_network is None: prob_network = MLP( - input_shape=(env_spec.observation_space.flat_dim,), + input_shape=(env_spec.observation_space.flat_dim, ), output_dim=env_spec.action_space.flat_dim, hidden_sizes=hidden_sizes, hidden_nonlinearity=hidden_nonlinearity, @@ -42,8 +41,7 @@ def __init__( self._l_obs = prob_network.input_layer self._f_prob = tensor_utils.compile_function( [prob_network.input_layer.input_var], - L.get_output(prob_network.output_layer, deterministic=True) - ) + L.get_output(prob_network.output_layer, deterministic=True)) self.prob_network = prob_network diff --git a/rllab/tf/policies/gaussian_gru_policy.py b/rllab/tf/policies/gaussian_gru_policy.py index fa2c9d70c..537f9b37a 100644 --- a/rllab/tf/policies/gaussian_gru_policy.py +++ b/rllab/tf/policies/gaussian_gru_policy.py @@ -178,20 +178,22 @@ def dist_info_sym(self, obs_var, state_info_vars, name="dist_info_sym"): obs_var = 
tf.reshape(obs_var, tf.stack([n_batches, n_steps, -1])) if self.state_include_action: prev_action_var = state_info_vars["prev_action"] - all_input_var = tf.concat(axis=2, values=[obs_var, prev_action_var]) + all_input_var = tf.concat( + axis=2, values=[obs_var, prev_action_var]) else: all_input_var = obs_var if self.feature_network is None: means, log_stds = L.get_output( [self.mean_network.output_layer, self.l_log_std], - {self.l_input: all_input_var} - ) + {self.l_input: all_input_var}) else: - flat_input_var = tf.reshape(all_input_var, (-1, self.input_dim)) + flat_input_var = tf.reshape(all_input_var, + (-1, self.input_dim)) means, log_stds = L.get_output( - [self.mean_network.output_layer, self.l_log_std], - {self.l_input: all_input_var, self.feature_network.input_layer: flat_input_var} - ) + [self.mean_network.output_layer, self.l_log_std], { + self.l_input: all_input_var, + self.feature_network.input_layer: flat_input_var + }) return dict(mean=means, log_std=log_stds) @property diff --git a/rllab/tf/policies/gaussian_lstm_policy.py b/rllab/tf/policies/gaussian_lstm_policy.py index 65a52f79b..b7540d7a9 100644 --- a/rllab/tf/policies/gaussian_lstm_policy.py +++ b/rllab/tf/policies/gaussian_lstm_policy.py @@ -187,20 +187,22 @@ def dist_info_sym(self, obs_var, state_info_vars, name="dist_info_sym"): obs_var = tf.reshape(obs_var, tf.stack([n_batches, n_steps, -1])) if self.state_include_action: prev_action_var = state_info_vars["prev_action"] - all_input_var = tf.concat(axis=2, values=[obs_var, prev_action_var]) + all_input_var = tf.concat( + axis=2, values=[obs_var, prev_action_var]) else: all_input_var = obs_var if self.feature_network is None: means, log_stds = L.get_output( [self.mean_network.output_layer, self.l_log_std], - {self.l_input: all_input_var} - ) + {self.l_input: all_input_var}) else: - flat_input_var = tf.reshape(all_input_var, (-1, self.input_dim)) + flat_input_var = tf.reshape(all_input_var, + (-1, self.input_dim)) means, log_stds = L.get_output( - [self.mean_network.output_layer, self.l_log_std], - {self.l_input: all_input_var, self.feature_network.input_layer: flat_input_var} - ) + [self.mean_network.output_layer, self.l_log_std], { + self.l_input: all_input_var, + self.feature_network.input_layer: flat_input_var + }) return dict(mean=means, log_std=log_stds) @property diff --git a/rllab/tf/policies/gaussian_mlp_policy.py b/rllab/tf/policies/gaussian_mlp_policy.py index e986865b1..86a6be3c1 100644 --- a/rllab/tf/policies/gaussian_mlp_policy.py +++ b/rllab/tf/policies/gaussian_mlp_policy.py @@ -14,25 +14,24 @@ from rllab.tf.misc.tensor_utils import enclosing_scope import tensorflow as tf + class GaussianMLPPolicy(StochasticPolicy, LayersPowered, Serializable): - def __init__( - self, - env_spec, - name="GaussianMLPPolicy", - hidden_sizes=(32, 32), - learn_std=True, - init_std=1.0, - adaptive_std=False, - std_share_network=False, - std_hidden_sizes=(32, 32), - min_std=1e-6, - std_hidden_nonlinearity=tf.nn.tanh, - hidden_nonlinearity=tf.nn.tanh, - output_nonlinearity=None, - mean_network=None, - std_network=None, - std_parametrization='exp' - ): + def __init__(self, + env_spec, + name="GaussianMLPPolicy", + hidden_sizes=(32, 32), + learn_std=True, + init_std=1.0, + adaptive_std=False, + std_share_network=False, + std_hidden_sizes=(32, 32), + min_std=1e-6, + std_hidden_nonlinearity=tf.nn.tanh, + hidden_nonlinearity=tf.nn.tanh, + output_nonlinearity=None, + mean_network=None, + std_network=None, + std_parametrization='exp'): """ :param env_spec: :param hidden_sizes: list 
of sizes for the fully-connected hidden layers @@ -174,9 +173,13 @@ def __init__( def vectorized(self): return True - def dist_info_sym(self, obs_var, state_info_vars=None, name="dist_info_sym"): + def dist_info_sym(self, + obs_var, + state_info_vars=None, + name="dist_info_sym"): with enclosing_scope(self.name, name): - mean_var, std_param_var = L.get_output([self._l_mean, self._l_std_param], obs_var) + mean_var, std_param_var = L.get_output( + [self._l_mean, self._l_std_param], obs_var) if self.min_std_param is not None: std_param_var = tf.maximum(std_param_var, self.min_std_param) if self.std_parametrization == 'exp': @@ -202,7 +205,11 @@ def get_actions(self, observations): actions = rnd * np.exp(log_stds) + means return actions, dict(mean=means, log_std=log_stds) - def get_reparam_action_sym(self, obs_var, action_var, old_dist_info_vars, name="get_reparam_action_sym"): + def get_reparam_action_sym(self, + obs_var, + action_var, + old_dist_info_vars, + name="get_reparam_action_sym"): """ Given observations, old actions, and distribution of old actions, return a symbolically reparameterized representation of the actions in terms of the policy parameters @@ -213,10 +220,14 @@ def get_reparam_action_sym(self, obs_var, action_var, old_dist_info_vars, name=" """ with enclosing_scope(self.name, name): new_dist_info_vars = self.dist_info_sym(obs_var, action_var) - new_mean_var, new_log_std_var = new_dist_info_vars["mean"], new_dist_info_vars["log_std"] - old_mean_var, old_log_std_var = old_dist_info_vars["mean"], old_dist_info_vars["log_std"] - epsilon_var = (action_var - old_mean_var) / (tf.exp(old_log_std_var) + 1e-8) - new_action_var = new_mean_var + epsilon_var * tf.exp(new_log_std_var) + new_mean_var, new_log_std_var = new_dist_info_vars[ + "mean"], new_dist_info_vars["log_std"] + old_mean_var, old_log_std_var = old_dist_info_vars[ + "mean"], old_dist_info_vars["log_std"] + epsilon_var = (action_var - old_mean_var) / ( + tf.exp(old_log_std_var) + 1e-8) + new_action_var = new_mean_var + epsilon_var * tf.exp( + new_log_std_var) return new_action_var def log_diagnostics(self, paths): diff --git a/rllab/tf/regressors/bernoulli_mlp_regressor.py b/rllab/tf/regressors/bernoulli_mlp_regressor.py index 3116fd195..f935c97ee 100644 --- a/rllab/tf/regressors/bernoulli_mlp_regressor.py +++ b/rllab/tf/regressors/bernoulli_mlp_regressor.py @@ -1,6 +1,3 @@ - - - import rllab.tf.core.layers as L import numpy as np import tensorflow as tf @@ -63,23 +60,31 @@ def __init__( hidden_sizes=hidden_sizes, hidden_nonlinearity=hidden_nonlinearity, output_nonlinearity=tf.nn.sigmoid, - name="p_network" - ) + name="p_network") l_p = p_network.output_layer LayersPowered.__init__(self, [l_p]) xs_var = p_network.input_layer.input_var - ys_var = tf.placeholder(dtype=tf.float32, shape=(None, output_dim), name="ys") - old_p_var = tf.placeholder(dtype=tf.float32, shape=(None, output_dim), name="old_p") - - x_mean_var = tf.get_variable(name="x_mean", initializer=tf.zeros_initializer(), shape=(1,) + input_shape) - x_std_var = tf.get_variable(name="x_std", initializer=tf.ones_initializer(), shape=(1,) + input_shape) + ys_var = tf.placeholder( + dtype=tf.float32, shape=(None, output_dim), name="ys") + old_p_var = tf.placeholder( + dtype=tf.float32, shape=(None, output_dim), name="old_p") + + x_mean_var = tf.get_variable( + name="x_mean", + initializer=tf.zeros_initializer(), + shape=(1, ) + input_shape) + x_std_var = tf.get_variable( + name="x_std", + initializer=tf.ones_initializer(), + shape=(1, ) + input_shape) 
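(Aside, not part of the patch: the `get_reparam_action_sym` hunk reformatted above only re-wraps the reparameterization arithmetic; it does not change it. A minimal NumPy sketch of that same arithmetic, with illustrative array names rather than rllab API, may help readers checking the re-wrapped lines:)

import numpy as np

def reparam_action(action, old_mean, old_log_std, new_mean, new_log_std):
    # Recover the noise that produced `action` under the old Gaussian policy;
    # the 1e-8 matches the epsilon guard in the hunk above.
    epsilon = (action - old_mean) / (np.exp(old_log_std) + 1e-8)
    # Re-apply that noise through the new policy's mean and log-std, so the
    # action is expressed symbolically in terms of the new parameters.
    return new_mean + epsilon * np.exp(new_log_std)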
normalized_xs_var = (xs_var - x_mean_var) / x_std_var - p_var = L.get_output(l_p, {p_network.input_layer: normalized_xs_var}) + p_var = L.get_output(l_p, + {p_network.input_layer: normalized_xs_var}) old_info_vars = dict(p=old_p_var) info_vars = dict(p=p_var) @@ -88,7 +93,7 @@ def __init__( mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars)) - loss = - tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars)) + loss = -tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars)) predicted = p_var >= 0.5 @@ -96,11 +101,17 @@ def __init__( self.f_p = tensor_utils.compile_function([xs_var], p_var) self.l_p = l_p - self.optimizer.update_opt(loss=loss, target=self, network_outputs=[p_var], inputs=[xs_var, ys_var]) - self.tr_optimizer.update_opt(loss=loss, target=self, network_outputs=[p_var], - inputs=[xs_var, ys_var, old_p_var], - leq_constraint=(mean_kl, step_size) - ) + self.optimizer.update_opt( + loss=loss, + target=self, + network_outputs=[p_var], + inputs=[xs_var, ys_var]) + self.tr_optimizer.update_opt( + loss=loss, + target=self, + network_outputs=[p_var], + inputs=[xs_var, ys_var, old_p_var], + leq_constraint=(mean_kl, step_size)) self.use_trust_region = use_trust_region self.name = name @@ -115,10 +126,11 @@ def fit(self, xs, ys): # recompute normalizing constants for inputs new_mean = np.mean(xs, axis=0, keepdims=True) new_std = np.std(xs, axis=0, keepdims=True) + 1e-8 - tf.get_default_session().run(tf.group( - tf.assign(self.x_mean_var, new_mean), - tf.assign(self.x_std_var, new_std), - )) + tf.get_default_session().run( + tf.group( + tf.assign(self.x_mean_var, new_mean), + tf.assign(self.x_std_var, new_std), + )) # self._x_mean_var.set_value(np.mean(xs, axis=0, keepdims=True)) # self._x_std_var.set_value(np.std(xs, axis=0, keepdims=True) + 1e-8) if self.use_trust_region and self.first_optimized: diff --git a/rllab/tf/regressors/categorical_mlp_regressor.py b/rllab/tf/regressors/categorical_mlp_regressor.py index 0f8a0e68d..808c2c2c5 100644 --- a/rllab/tf/regressors/categorical_mlp_regressor.py +++ b/rllab/tf/regressors/categorical_mlp_regressor.py @@ -1,6 +1,3 @@ - - - import numpy as np import tensorflow as tf @@ -68,31 +65,31 @@ def __init__( hidden_sizes=hidden_sizes, hidden_nonlinearity=hidden_nonlinearity, output_nonlinearity=tf.nn.softmax, - name="prob_network" - ) + name="prob_network") l_prob = prob_network.output_layer LayersPowered.__init__(self, [l_prob]) xs_var = prob_network.input_layer.input_var - ys_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="ys") - old_prob_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="old_prob") + ys_var = tf.placeholder( + dtype=tf.float32, shape=[None, output_dim], name="ys") + old_prob_var = tf.placeholder( + dtype=tf.float32, shape=[None, output_dim], name="old_prob") x_mean_var = tf.get_variable( name="x_mean", - shape=(1,) + input_shape, - initializer=tf.constant_initializer(0., dtype=tf.float32) - ) + shape=(1, ) + input_shape, + initializer=tf.constant_initializer(0., dtype=tf.float32)) x_std_var = tf.get_variable( name="x_std", - shape=(1,) + input_shape, - initializer=tf.constant_initializer(1., dtype=tf.float32) - ) + shape=(1, ) + input_shape, + initializer=tf.constant_initializer(1., dtype=tf.float32)) normalized_xs_var = (xs_var - x_mean_var) / x_std_var - prob_var = L.get_output(l_prob, {prob_network.input_layer: normalized_xs_var}) + prob_var = L.get_output( + l_prob, {prob_network.input_layer: normalized_xs_var}) old_info_vars = dict(prob=old_prob_var) info_vars = 
dict(prob=prob_var) @@ -101,20 +98,27 @@ def __init__( mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars)) - loss = - tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars)) + loss = -tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars)) - predicted = tensor_utils.to_onehot_sym(tf.argmax(prob_var, axis=1), output_dim) + predicted = tensor_utils.to_onehot_sym( + tf.argmax(prob_var, axis=1), output_dim) self.prob_network = prob_network self.f_predict = tensor_utils.compile_function([xs_var], predicted) self.f_prob = tensor_utils.compile_function([xs_var], prob_var) self.l_prob = l_prob - self.optimizer.update_opt(loss=loss, target=self, network_outputs=[prob_var], inputs=[xs_var, ys_var]) - self.tr_optimizer.update_opt(loss=loss, target=self, network_outputs=[prob_var], - inputs=[xs_var, ys_var, old_prob_var], - leq_constraint=(mean_kl, step_size) - ) + self.optimizer.update_opt( + loss=loss, + target=self, + network_outputs=[prob_var], + inputs=[xs_var, ys_var]) + self.tr_optimizer.update_opt( + loss=loss, + target=self, + network_outputs=[prob_var], + inputs=[xs_var, ys_var, old_prob_var], + leq_constraint=(mean_kl, step_size)) self.use_trust_region = use_trust_region self.name = name @@ -129,10 +133,11 @@ def fit(self, xs, ys): # recompute normalizing constants for inputs new_mean = np.mean(xs, axis=0, keepdims=True) new_std = np.std(xs, axis=0, keepdims=True) + 1e-8 - tf.get_default_session().run(tf.group( - tf.assign(self.x_mean_var, new_mean), - tf.assign(self.x_std_var, new_std), - )) + tf.get_default_session().run( + tf.group( + tf.assign(self.x_mean_var, new_mean), + tf.assign(self.x_std_var, new_std), + )) if self.use_trust_region and self.first_optimized: old_prob = self.f_prob(xs) inputs = [xs, ys, old_prob] @@ -161,12 +166,14 @@ def predict_log_likelihood(self, xs, ys): def dist_info_sym(self, x_var): normalized_xs_var = (x_var - self.x_mean_var) / self.x_std_var - prob = L.get_output(self.l_prob, {self.prob_network.input_layer: normalized_xs_var}) + prob = L.get_output(self.l_prob, + {self.prob_network.input_layer: normalized_xs_var}) return dict(prob=prob) def log_likelihood_sym(self, x_var, y_var): normalized_xs_var = (x_var - self.x_mean_var) / self.x_std_var - prob = L.get_output(self.l_prob, {self.prob_network.input_layer: normalized_xs_var}) + prob = L.get_output(self.l_prob, + {self.prob_network.input_layer: normalized_xs_var}) return self._dist.log_likelihood_sym(y_var, dict(prob=prob)) def get_param_values(self, **tags): diff --git a/rllab/tf/regressors/deterministic_mlp_regressor.py b/rllab/tf/regressors/deterministic_mlp_regressor.py index 0375e251c..9ac28c7bc 100644 --- a/rllab/tf/regressors/deterministic_mlp_regressor.py +++ b/rllab/tf/regressors/deterministic_mlp_regressor.py @@ -1,9 +1,3 @@ - - - - - - import numpy as np import tensorflow as tf @@ -62,34 +56,34 @@ def __init__( hidden_sizes=hidden_sizes, hidden_nonlinearity=hidden_nonlinearity, output_nonlinearity=output_nonlinearity, - name="network" - ) + name="network") l_out = network.output_layer LayersPowered.__init__(self, [l_out]) xs_var = network.input_layer.input_var - ys_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="ys") + ys_var = tf.placeholder( + dtype=tf.float32, shape=[None, output_dim], name="ys") x_mean_var = tf.get_variable( name="x_mean", - shape=(1,) + input_shape, - initializer=tf.constant_initializer(0., dtype=tf.float32) - ) + shape=(1, ) + input_shape, + initializer=tf.constant_initializer(0., dtype=tf.float32)) x_std_var = tf.get_variable( 
name="x_std", - shape=(1,) + input_shape, - initializer=tf.constant_initializer(1., dtype=tf.float32) - ) + shape=(1, ) + input_shape, + initializer=tf.constant_initializer(1., dtype=tf.float32)) normalized_xs_var = (xs_var - x_mean_var) / x_std_var - fit_ys_var = L.get_output(l_out, {network.input_layer: normalized_xs_var}) + fit_ys_var = L.get_output(l_out, + {network.input_layer: normalized_xs_var}) - loss = - tf.reduce_mean(tf.square(fit_ys_var - ys_var)) + loss = -tf.reduce_mean(tf.square(fit_ys_var - ys_var)) - self.f_predict = tensor_utils.compile_function([xs_var], fit_ys_var) + self.f_predict = tensor_utils.compile_function([xs_var], + fit_ys_var) optimizer_args = dict( loss=loss, diff --git a/rllab/tf/regressors/gaussian_mlp_regressor.py b/rllab/tf/regressors/gaussian_mlp_regressor.py index 4f30f8806..ba68cb428 100644 --- a/rllab/tf/regressors/gaussian_mlp_regressor.py +++ b/rllab/tf/regressors/gaussian_mlp_regressor.py @@ -17,27 +17,25 @@ class GaussianMLPRegressor(LayersPowered, Serializable): A class for performing regression by fitting a Gaussian distribution to the outputs. """ - def __init__( - self, - input_shape, - output_dim, - name="GaussianMLPRegressor", - mean_network=None, - hidden_sizes=(32, 32), - hidden_nonlinearity=tf.nn.tanh, - optimizer=None, - use_trust_region=True, - step_size=0.01, - learn_std=True, - init_std=1.0, - adaptive_std=False, - std_share_network=False, - std_hidden_sizes=(32, 32), - std_nonlinearity=None, - normalize_inputs=True, - normalize_outputs=True, - subsample_factor=1.0 - ): + def __init__(self, + input_shape, + output_dim, + name="GaussianMLPRegressor", + mean_network=None, + hidden_sizes=(32, 32), + hidden_nonlinearity=tf.nn.tanh, + optimizer=None, + use_trust_region=True, + step_size=0.01, + learn_std=True, + init_std=1.0, + adaptive_std=False, + std_share_network=False, + std_hidden_sizes=(32, 32), + std_nonlinearity=None, + normalize_inputs=True, + normalize_outputs=True, + subsample_factor=1.0): """ :param input_shape: Shape of the input data. :param output_dim: Dimension of output.