Directly using and serializing gym.spaces and their value (#116)
* Directly using and serializing gym.spaces and their value

* Introducing debug inspector of received observation on the web side

* Take review feedback into account
cloderic committed Jan 24, 2023
1 parent bf8ebbc commit a1483cf
Showing 74 changed files with 1,922 additions and 1,873 deletions.
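
For context on the change, here is a minimal standalone sketch of the pattern this commit moves toward: working directly with gym.spaces objects, sampling values from them, and flattening those values into fixed-size vectors for a neural network. It uses plain gym only; the spaces and sizes are made up for illustration and are not the cogment_verse code.

import numpy as np
from gym.spaces import Box, Discrete, utils

# Illustrative spaces; in cogment_verse they come from the environment specs.
observation_space = Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
action_space = Discrete(2)

# flatdim gives the flattened vector size, useful for sizing network inputs/outputs.
num_input = utils.flatdim(observation_space)   # 4
num_output = utils.flatdim(action_space)       # 2 (a Discrete value flattens to a one-hot vector)

# Sample a value from a space and flatten it for a model.
observation = observation_space.sample()
flat_observation = utils.flatten(observation_space, observation)

# Recover a structured value from a flat vector.
round_tripped = utils.unflatten(observation_space, flat_observation)
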
57 changes: 31 additions & 26 deletions actors/ppo.py
@@ -20,20 +20,17 @@
import numpy as np
import torch
from torch.distributions.normal import Normal
from gym.spaces import Box, utils

from cogment_verse import Model
from cogment_verse.run.run_session import RunSession
from cogment_verse.run.sample_producer_worker import SampleProducerSession
from cogment_verse.specs import (
PLAYER_ACTOR_CLASS,
AgentConfig,
cog_settings,
EnvironmentConfig,
EnvironmentSpecs,
PlayerAction,
cog_settings,
flatten,
flattened_dimensions,
unflatten,
PLAYER_ACTOR_CLASS,
)

torch.multiprocessing.set_sharing_strategy("file_system")
@@ -308,14 +305,15 @@ def get_actor_classes(self):
async def impl(self, actor_session):
# Start a session
actor_session.start()

config = actor_session.config
assert config.environment_specs.num_players == 1
assert len(config.environment_specs.action_space.properties) == 1
assert config.environment_specs.action_space.properties[0].WhichOneof("type") == "box"

# Get observation and action space
observation_space = config.environment_specs.observation_space
action_space = config.environment_specs.action_space
environment_specs = EnvironmentSpecs.deserialize(config.environment_specs)
observation_space = environment_specs.get_observation_space()
action_space = environment_specs.get_action_space()

assert isinstance(action_space.gym_space, Box)
assert config.environment_specs.num_players == 1

# Get model
model, _, _ = await actor_session.model_registry.retrieve_version(
@@ -324,9 +322,9 @@ async def impl(self, actor_session):

async for event in actor_session.all_events():
if event.observation and event.type == cogment.EventType.ACTIVE:
obs_tensor = torch.tensor(
flatten(observation_space, event.observation.observation.value), dtype=self._dtype
).view(1, -1)
observation = observation_space.deserialize(event.observation.observation)

obs_tensor = torch.tensor(observation.flat_value, dtype=self._dtype).view(1, -1)

# Normalize the observation
if model.state_normalization is not None:
@@ -339,11 +337,11 @@
# Get action from policy network
with torch.no_grad():
dist, _ = model.policy_network(obs_tensor)
action = dist.sample().cpu().numpy()[0]
action_value = dist.sample().cpu().numpy()[0]

# Send action to environment
action_value = unflatten(action_space, action)
actor_session.do_action(PlayerAction(value=action_value))
action = action_space.create(value=action_value)
actor_session.do_action(action_space.serialize(action))


class PPOTraining:
@@ -392,8 +390,8 @@ def __init__(self, environment_specs: EnvironmentSpecs, cfg: EnvironmentConfig)
self.model = PPOModel(
model_id="",
environment_implementation=self._environment_specs.implementation,
num_input=flattened_dimensions(self._environment_specs.observation_space),
num_output=flattened_dimensions(self._environment_specs.action_space),
num_input=utils.flatdim(self._environment_specs.get_observation_space().gym_space),
num_output=utils.flatdim(self._environment_specs.get_action_space().gym_space),
learning_rate=self._cfg.learning_rate,
n_iter=self._cfg.num_epochs,
policy_network_hidden_nodes=self._cfg.policy_network.num_hidden_nodes,
@@ -404,15 +402,20 @@

async def trial_sample_sequences_producer_impl(self, sample_producer_session: SampleProducerSession):
"""Collect sample from the trial"""

# Share with A2C

observation = []
action = []
reward = []
done = []

player_actor_params = sample_producer_session.trial_info.parameters.actors[0]

player_actor_name = player_actor_params.name
player_observation_space = player_actor_params.config.environment_specs.observation_space
player_action_space = player_actor_params.config.environment_specs.action_space
player_environment_specs = EnvironmentSpecs.deserialize(player_actor_params.config.environment_specs)
player_observation_space = player_environment_specs.get_observation_space()
player_action_space = player_environment_specs.get_action_space()

async for sample in sample_producer_session.all_trial_samples():
if sample.trial_state == cogment.TrialState.ENDED:
@@ -423,9 +426,10 @@ async def trial_sample_sequences_producer_impl(self, sample_producer_session: Sa

actor_sample = sample.actors_data[player_actor_name]
observation.append(
torch.tensor(flatten(player_observation_space, actor_sample.observation.value), dtype=self._dtype)
torch.tensor(player_observation_space.deserialize(actor_sample.observation).value, dtype=self._dtype)
)
action.append(torch.tensor(flatten(player_action_space, actor_sample.action.value), dtype=self._dtype))

action.append(torch.tensor(player_action_space.deserialize(actor_sample.action).value, dtype=self._dtype))
reward.append(
torch.tensor(actor_sample.reward if actor_sample.reward is not None else 0, dtype=self._dtype)
)
@@ -438,8 +442,9 @@ async def impl(self, run_session: RunSession) -> dict:
"""Train and publish model the model"""

model_id = f"{run_session.run_id}_model"

assert self._environment_specs.num_players == 1
assert len(self._environment_specs.action_space.properties) == 1
assert isinstance(self._environment_specs.get_action_space().gym_space, Box)

# Initialize model
self.model.model_id = model_id
Expand All @@ -462,7 +467,7 @@ def create_trial_params(trial_idx: int, iter_idx: int):
implementation="actors.ppo.PPOActor",
config=AgentConfig(
run_id=run_session.run_id,
environment_specs=self._environment_specs,
environment_specs=self._environment_specs.serialize(),
model_id=model_id,
model_version=version_info["version_number"],
),
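
A standalone sketch of the continuous-action path used in the ppo.py hunks above: a policy head produces a Normal distribution over a Box action space, an action value is sampled as a NumPy array, and that value is what gets wrapped by action_space.create(value=...) before serialization. The network sizes and the fixed standard deviation are hypothetical stand-ins, not the actual PPOModel.

import torch
from torch.distributions.normal import Normal

num_input, num_output = 4, 2  # hypothetical observation/action sizes
policy_head = torch.nn.Linear(num_input, num_output)

obs_tensor = torch.randn(1, num_input)  # stand-in for the flattened, normalized observation
with torch.no_grad():
    mean = policy_head(obs_tensor)
    dist = Normal(mean, torch.ones_like(mean) * 0.1)  # fixed std, for illustration only
    action_value = dist.sample().cpu().numpy()[0]  # numpy array of shape (num_output,)
# action_value is the kind of value passed to action_space.create(value=...) above
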
27 changes: 11 additions & 16 deletions actors/random_actor.py
@@ -13,13 +13,8 @@
# limitations under the License.

import cogment
import numpy as np

from cogment_verse.specs import (
PLAYER_ACTOR_CLASS,
PlayerAction,
sample_space,
)
from cogment_verse.specs import PLAYER_ACTOR_CLASS, EnvironmentSpecs


class RandomActor:
@@ -33,19 +28,19 @@ async def impl(self, actor_session):
actor_session.start()

config = actor_session.config
environment_specs = EnvironmentSpecs.deserialize(config.environment_specs)
observation_space = environment_specs.get_observation_space()
action_space = environment_specs.get_action_space()

action_space = config.environment_specs.action_space

rng = np.random.default_rng(config.seed if config.seed is not None else 0)
action_space.gym_space.seed(config.seed if config.seed is not None else 0)

async for event in actor_session.all_events():
if event.observation and event.type == cogment.EventType.ACTIVE:
if (
event.observation.observation.HasField("current_player")
and event.observation.observation.current_player != actor_session.name
):
observation = observation_space.deserialize(event.observation.observation)
if observation.current_player is not None and observation.current_player != actor_session.name:
# Not the turn of the agent
actor_session.do_action(PlayerAction())
actor_session.do_action(action_space.serialize(action_space.create()))
continue
[action_value] = sample_space(action_space, rng=rng, mask=event.observation.observation.action_mask)
actor_session.do_action(PlayerAction(value=action_value))

action = action_space.sample(mask=observation.action_mask)
actor_session.do_action(action_space.serialize(action))
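
The new action_space.sample(mask=observation.action_mask) call above mirrors gym's own masked sampling. A standalone sketch, assuming a gym version (0.25 or later) whose Discrete.sample accepts an int8 mask; the space size and mask values are made up for illustration.

import numpy as np
from gym.spaces import Discrete

action_space = Discrete(4)
action_space.seed(0)  # seeding the space itself, as the updated random actor does

# Only actions 1 and 3 are legal; the mask has one int8 entry per action
action_mask = np.array([0, 1, 0, 1], dtype=np.int8)
action = action_space.sample(mask=action_mask)
assert action in (1, 3)
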
69 changes: 26 additions & 43 deletions actors/simple_a2c.py
@@ -12,27 +12,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=C0303
# pylint: disable=W0611
# pylint: disable=W0612

import logging

import cogment
import torch

from gym.spaces import utils, Discrete

from cogment_verse import Model
from cogment_verse.specs import (
PLAYER_ACTOR_CLASS,
AgentConfig,
EnvironmentConfig,
PlayerAction,
SpaceValue,
cog_settings,
flatten,
flattened_dimensions,
unflatten,
EnvironmentConfig,
EnvironmentSpecs,
)
from cogment_verse.constants import PLAYER_ACTOR_CLASS

torch.multiprocessing.set_sharing_strategy("file_system")

@@ -132,12 +126,9 @@ async def impl(self, actor_session):

config = actor_session.config

assert config.environment_specs.num_players == 1
assert len(config.environment_specs.action_space.properties) == 1
# assert config.environment_specs.action_space.properties[0].WhichOneof("type") == "discrete"

observation_space = config.environment_specs.observation_space
action_space = config.environment_specs.action_space
environment_specs = EnvironmentSpecs.deserialize(config.environment_specs)
observation_space = environment_specs.get_observation_space()
action_space = environment_specs.get_action_space(seed=config.seed)

model, _, _ = await actor_session.model_registry.retrieve_version(
SimpleA2CModel, config.model_id, config.model_version
@@ -147,22 +138,17 @@

async for event in actor_session.all_events():
if event.observation and event.type == cogment.EventType.ACTIVE:
obs_tensor = torch.tensor(
flatten(observation_space, event.observation.observation.value), dtype=self._dtype
)
if config.environment_specs.action_space.properties[0].WhichOneof("type") == "discrete":
probs = torch.softmax(model.actor_network(obs_tensor), dim=-1)
discrete_action_tensor = torch.distributions.Categorical(probs).sample()
action_value = SpaceValue(
properties=[SpaceValue.PropertyValue(discrete=discrete_action_tensor.item())]
)
observation = observation_space.deserialize(event.observation.observation)

if isinstance(action_space.gym_space, Discrete):
observation_tensor = torch.tensor(observation.flat_value, dtype=self._dtype)
probs = torch.softmax(model.actor_network(observation_tensor), dim=-1)
discrete_action_tensor = torch.distributions.Categorical(probs).sample()
action = action_space.create(value=discrete_action_tensor.numpy())
else:
action = torch.rand((1,) + (action_space.properties[0].box.shape[0],))
action = action.cpu().numpy()[0]
action_value = unflatten(action_space, action)
action = action_space.sample()

actor_session.do_action(PlayerAction(value=action_value))
actor_session.do_action(action_space.serialize(action))


class SimpleA2CTraining:
@@ -195,7 +181,9 @@ async def trial_sample_sequences_producer_impl(self, sample_producer_session):
player_actor_params = sample_producer_session.trial_info.parameters.actors[0]

player_actor_name = player_actor_params.name
player_observation_space = player_actor_params.config.environment_specs.observation_space
player_environment_specs = EnvironmentSpecs.deserialize(player_actor_params.config.environment_specs)
player_observation_space = player_environment_specs.get_observation_space()
player_action_space = player_environment_specs.get_action_space()

async for sample in sample_producer_session.all_trial_samples():
if sample.trial_state == cogment.TrialState.ENDED:
@@ -206,14 +194,10 @@ async def trial_sample_sequences_producer_impl(self, sample_producer_session):

actor_sample = sample.actors_data[player_actor_name]
observation.append(
torch.tensor(flatten(player_observation_space, actor_sample.observation.value), dtype=self._dtype)
)
action_value = actor_sample.action.value
action.append(
torch.tensor(
action_value.properties[0].discrete if len(action_value.properties) > 0 else 0, dtype=self._dtype
)
torch.tensor(player_observation_space.deserialize(actor_sample.observation).value, dtype=self._dtype)
)

action.append(torch.tensor(player_action_space.deserialize(actor_sample.action).value, dtype=self._dtype))
reward.append(
torch.tensor(actor_sample.reward if actor_sample.reward is not None else 0, dtype=self._dtype)
)
@@ -227,14 +211,13 @@ async def impl(self, run_session):
model_id = f"{run_session.run_id}_model"

assert self._environment_specs.num_players == 1
assert len(self._environment_specs.action_space.properties) == 1
# assert self._environment_specs.action_space.properties[0].WhichOneof("type") == "discrete"
assert isinstance(self._environment_specs.get_action_space().gym_space, Discrete)

model = SimpleA2CModel(
model_id,
environment_implementation=self._environment_specs.implementation,
num_input=flattened_dimensions(self._environment_specs.observation_space),
num_output=flattened_dimensions(self._environment_specs.action_space),
num_input=utils.flatdim(self._environment_specs.get_observation_space().gym_space),
num_output=utils.flatdim(self._environment_specs.get_action_space().gym_space),
actor_network_num_hidden_nodes=self._cfg.actor_network.num_hidden_nodes,
critic_network_num_hidden_nodes=self._cfg.critic_network.num_hidden_nodes,
dtype=self._dtype,
@@ -285,7 +268,7 @@ async def impl(self, run_session):
run_id=run_session.run_id,
model_id=model_id,
model_version=version_info["version_number"],
environment_specs=self._environment_specs,
environment_specs=self._environment_specs.serialize(),
),
)
],
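
For the discrete branch in the simple_a2c.py hunks above, a standalone sketch of turning actor-network logits into a sampled discrete action with Categorical; the resulting .numpy() value is the kind of value handed to action_space.create(value=...). The logits here are random stand-ins, not the actual model output.

import torch

num_actions = 3  # hypothetical size of a Discrete action space
logits = torch.randn(num_actions)  # stand-in for model.actor_network(observation_tensor)
probs = torch.softmax(logits, dim=-1)
discrete_action_tensor = torch.distributions.Categorical(probs).sample()
action_value = discrete_action_tensor.numpy()  # 0-d array, passed to action_space.create(value=...)
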