Directly using and serializing gym.spaces and their value (#116)
* Directly using and serializing gym.spaces and their value

* Introducing debug inspector of received observation on the web side

* Take review feedback into account
cloderic committed Jan 24, 2023
1 parent bf8ebbc commit a1483cf
Showing 74 changed files with 1,922 additions and 1,873 deletions.
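
For context on the change, here is a minimal standalone sketch of the pattern this commit moves toward: working directly with gym.spaces objects, sampling values from them, and flattening those values into fixed-size vectors for a neural network. It uses plain gym only; the spaces and sizes are made up for illustration and are not the cogment_verse code.

import numpy as np
from gym.spaces import Box, Discrete, utils

# Illustrative spaces; in cogment_verse they come from the environment specs.
observation_space = Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
action_space = Discrete(2)

# flatdim gives the flattened vector size, useful for sizing network inputs/outputs.
num_input = utils.flatdim(observation_space)   # 4
num_output = utils.flatdim(action_space)       # 2 (a Discrete value flattens to a one-hot vector)

# Sample a value from a space and flatten it for a model.
observation = observation_space.sample()
flat_observation = utils.flatten(observation_space, observation)

# Recover a structured value from a flat vector.
round_tripped = utils.unflatten(observation_space, flat_observation)
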
57 changes: 31 additions & 26 deletions actors/ppo.py
@@ -20,20 +20,17 @@
import numpy as np
import torch
from torch.distributions.normal import Normal
from gym.spaces import Box, utils

from cogment_verse import Model
from cogment_verse.run.run_session import RunSession
from cogment_verse.run.sample_producer_worker import SampleProducerSession
from cogment_verse.specs import (
PLAYER_ACTOR_CLASS,
AgentConfig,
cog_settings,
EnvironmentConfig,
EnvironmentSpecs,
PlayerAction,
cog_settings,
flatten,
flattened_dimensions,
unflatten,
PLAYER_ACTOR_CLASS,
)

torch.multiprocessing.set_sharing_strategy("file_system")
@@ -308,14 +305,15 @@ def get_actor_classes(self):
async def impl(self, actor_session):
# Start a session
actor_session.start()

config = actor_session.config
assert config.environment_specs.num_players == 1
assert len(config.environment_specs.action_space.properties) == 1
assert config.environment_specs.action_space.properties[0].WhichOneof("type") == "box"

# Get observation and action space
observation_space = config.environment_specs.observation_space
action_space = config.environment_specs.action_space
environment_specs = EnvironmentSpecs.deserialize(config.environment_specs)
observation_space = environment_specs.get_observation_space()
action_space = environment_specs.get_action_space()

assert isinstance(action_space.gym_space, Box)
assert config.environment_specs.num_players == 1

# Get model
model, _, _ = await actor_session.model_registry.retrieve_version(
@@ -324,9 +322,9 @@ async def impl(self, actor_session):

async for event in actor_session.all_events():
if event.observation and event.type == cogment.EventType.ACTIVE:
obs_tensor = torch.tensor(
flatten(observation_space, event.observation.observation.value), dtype=self._dtype
).view(1, -1)
observation = observation_space.deserialize(event.observation.observation)

obs_tensor = torch.tensor(observation.flat_value, dtype=self._dtype).view(1, -1)

# Normalize the observation
if model.state_normalization is not None:
@@ -339,11 +337,11 @@
# Get action from policy network
with torch.no_grad():
dist, _ = model.policy_network(obs_tensor)
action = dist.sample().cpu().numpy()[0]
action_value = dist.sample().cpu().numpy()[0]

# Send action to environment
action_value = unflatten(action_space, action)
actor_session.do_action(PlayerAction(value=action_value))
action = action_space.create(value=action_value)
actor_session.do_action(action_space.serialize(action))


class PPOTraining:
@@ -392,8 +390,8 @@ def __init__(self, environment_specs: EnvironmentSpecs, cfg: EnvironmentConfig)
self.model = PPOModel(
model_id="",
environment_implementation=self._environment_specs.implementation,
num_input=flattened_dimensions(self._environment_specs.observation_space),
num_output=flattened_dimensions(self._environment_specs.action_space),
num_input=utils.flatdim(self._environment_specs.get_observation_space().gym_space),
num_output=utils.flatdim(self._environment_specs.get_action_space().gym_space),
learning_rate=self._cfg.learning_rate,
n_iter=self._cfg.num_epochs,
policy_network_hidden_nodes=self._cfg.policy_network.num_hidden_nodes,
@@ -404,15 +402,20 @@

async def trial_sample_sequences_producer_impl(self, sample_producer_session: SampleProducerSession):
"""Collect sample from the trial"""

# Share with A2C

observation = []
action = []
reward = []
done = []

player_actor_params = sample_producer_session.trial_info.parameters.actors[0]

player_actor_name = player_actor_params.name
player_observation_space = player_actor_params.config.environment_specs.observation_space
player_action_space = player_actor_params.config.environment_specs.action_space
player_environment_specs = EnvironmentSpecs.deserialize(player_actor_params.config.environment_specs)
player_observation_space = player_environment_specs.get_observation_space()
player_action_space = player_environment_specs.get_action_space()

async for sample in sample_producer_session.all_trial_samples():
if sample.trial_state == cogment.TrialState.ENDED:
@@ -423,9 +426,10 @@ async def trial_sample_sequences_producer_impl(self, sample_producer_session: Sa

actor_sample = sample.actors_data[player_actor_name]
observation.append(
torch.tensor(flatten(player_observation_space, actor_sample.observation.value), dtype=self._dtype)
torch.tensor(player_observation_space.deserialize(actor_sample.observation).value, dtype=self._dtype)
)
action.append(torch.tensor(flatten(player_action_space, actor_sample.action.value), dtype=self._dtype))

action.append(torch.tensor(player_action_space.deserialize(actor_sample.action).value, dtype=self._dtype))
reward.append(
torch.tensor(actor_sample.reward if actor_sample.reward is not None else 0, dtype=self._dtype)
)
@@ -438,8 +442,9 @@ async def impl(self, run_session: RunSession) -> dict:
"""Train and publish model the model"""

model_id = f"{run_session.run_id}_model"

assert self._environment_specs.num_players == 1
assert len(self._environment_specs.action_space.properties) == 1
assert isinstance(self._environment_specs.get_action_space().gym_space, Box)

# Initialize model
self.model.model_id = model_id
Expand All @@ -462,7 +467,7 @@ def create_trial_params(trial_idx: int, iter_idx: int):
implementation="actors.ppo.PPOActor",
config=AgentConfig(
run_id=run_session.run_id,
environment_specs=self._environment_specs,
environment_specs=self._environment_specs.serialize(),
model_id=model_id,
model_version=version_info["version_number"],
),
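
A standalone sketch of the continuous-action path used in the ppo.py hunks above: a policy head produces a Normal distribution over a Box action space, an action value is sampled as a NumPy array, and that value is what gets wrapped by action_space.create(value=...) before serialization. The network sizes and the fixed standard deviation are hypothetical stand-ins, not the actual PPOModel.

import torch
from torch.distributions.normal import Normal

num_input, num_output = 4, 2  # hypothetical observation/action sizes
policy_head = torch.nn.Linear(num_input, num_output)

obs_tensor = torch.randn(1, num_input)  # stand-in for the flattened, normalized observation
with torch.no_grad():
    mean = policy_head(obs_tensor)
    dist = Normal(mean, torch.ones_like(mean) * 0.1)  # fixed std, for illustration only
    action_value = dist.sample().cpu().numpy()[0]  # numpy array of shape (num_output,)
# action_value is the kind of value passed to action_space.create(value=...) above
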
27 changes: 11 additions & 16 deletions actors/random_actor.py
@@ -13,13 +13,8 @@
# limitations under the License.

import cogment
import numpy as np

from cogment_verse.specs import (
PLAYER_ACTOR_CLASS,
PlayerAction,
sample_space,
)
from cogment_verse.specs import PLAYER_ACTOR_CLASS, EnvironmentSpecs


class RandomActor:
@@ -33,19 +28,19 @@ async def impl(self, actor_session):
actor_session.start()

config = actor_session.config
environment_specs = EnvironmentSpecs.deserialize(config.environment_specs)
observation_space = environment_specs.get_observation_space()
action_space = environment_specs.get_action_space()

action_space = config.environment_specs.action_space

rng = np.random.default_rng(config.seed if config.seed is not None else 0)
action_space.gym_space.seed(config.seed if config.seed is not None else 0)

async for event in actor_session.all_events():
if event.observation and event.type == cogment.EventType.ACTIVE:
if (
event.observation.observation.HasField("current_player")
and event.observation.observation.current_player != actor_session.name
):
observation = observation_space.deserialize(event.observation.observation)
if observation.current_player is not None and observation.current_player != actor_session.name:
# Not the turn of the agent
actor_session.do_action(PlayerAction())
actor_session.do_action(action_space.serialize(action_space.create()))
continue
[action_value] = sample_space(action_space, rng=rng, mask=event.observation.observation.action_mask)
actor_session.do_action(PlayerAction(value=action_value))

action = action_space.sample(mask=observation.action_mask)
actor_session.do_action(action_space.serialize(action))
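
The new action_space.sample(mask=observation.action_mask) call above mirrors gym's own masked sampling. A standalone sketch, assuming a gym version (0.25 or later) whose Discrete.sample accepts an int8 mask; the space size and mask values are made up for illustration.

import numpy as np
from gym.spaces import Discrete

action_space = Discrete(4)
action_space.seed(0)  # seeding the space itself, as the updated random actor does

# Only actions 1 and 3 are legal; the mask has one int8 entry per action
action_mask = np.array([0, 1, 0, 1], dtype=np.int8)
action = action_space.sample(mask=action_mask)
assert action in (1, 3)
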
69 changes: 26 additions & 43 deletions actors/simple_a2c.py
@@ -12,27 +12,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=C0303
# pylint: disable=W0611
# pylint: disable=W0612

import logging

import cogment
import torch

from gym.spaces import utils, Discrete

from cogment_verse import Model
from cogment_verse.specs import (
PLAYER_ACTOR_CLASS,
AgentConfig,
EnvironmentConfig,
PlayerAction,
SpaceValue,
cog_settings,
flatten,
flattened_dimensions,
unflatten,
EnvironmentConfig,
EnvironmentSpecs,
)
from cogment_verse.constants import PLAYER_ACTOR_CLASS

torch.multiprocessing.set_sharing_strategy("file_system")

@@ -132,12 +126,9 @@ async def impl(self, actor_session):

config = actor_session.config

assert config.environment_specs.num_players == 1
assert len(config.environment_specs.action_space.properties) == 1
# assert config.environment_specs.action_space.properties[0].WhichOneof("type") == "discrete"

observation_space = config.environment_specs.observation_space
action_space = config.environment_specs.action_space
environment_specs = EnvironmentSpecs.deserialize(config.environment_specs)
observation_space = environment_specs.get_observation_space()
action_space = environment_specs.get_action_space(seed=config.seed)

model, _, _ = await actor_session.model_registry.retrieve_version(
SimpleA2CModel, config.model_id, config.model_version
@@ -147,22 +138,17 @@

async for event in actor_session.all_events():
if event.observation and event.type == cogment.EventType.ACTIVE:
obs_tensor = torch.tensor(
flatten(observation_space, event.observation.observation.value), dtype=self._dtype
)
if config.environment_specs.action_space.properties[0].WhichOneof("type") == "discrete":
probs = torch.softmax(model.actor_network(obs_tensor), dim=-1)
discrete_action_tensor = torch.distributions.Categorical(probs).sample()
action_value = SpaceValue(
properties=[SpaceValue.PropertyValue(discrete=discrete_action_tensor.item())]
)
observation = observation_space.deserialize(event.observation.observation)

if isinstance(action_space.gym_space, Discrete):
observation_tensor = torch.tensor(observation.flat_value, dtype=self._dtype)
probs = torch.softmax(model.actor_network(observation_tensor), dim=-1)
discrete_action_tensor = torch.distributions.Categorical(probs).sample()
action = action_space.create(value=discrete_action_tensor.numpy())
else:
action = torch.rand((1,) + (action_space.properties[0].box.shape[0],))
action = action.cpu().numpy()[0]
action_value = unflatten(action_space, action)
action = action_space.sample()

actor_session.do_action(PlayerAction(value=action_value))
actor_session.do_action(action_space.serialize(action))


class SimpleA2CTraining:
@@ -195,7 +181,9 @@ async def trial_sample_sequences_producer_impl(self, sample_producer_session):
player_actor_params = sample_producer_session.trial_info.parameters.actors[0]

player_actor_name = player_actor_params.name
player_observation_space = player_actor_params.config.environment_specs.observation_space
player_environment_specs = EnvironmentSpecs.deserialize(player_actor_params.config.environment_specs)
player_observation_space = player_environment_specs.get_observation_space()
player_action_space = player_environment_specs.get_action_space()

async for sample in sample_producer_session.all_trial_samples():
if sample.trial_state == cogment.TrialState.ENDED:
@@ -206,14 +194,10 @@ async def trial_sample_sequences_producer_impl(self, sample_producer_session):

actor_sample = sample.actors_data[player_actor_name]
observation.append(
torch.tensor(flatten(player_observation_space, actor_sample.observation.value), dtype=self._dtype)
)
action_value = actor_sample.action.value
action.append(
torch.tensor(
action_value.properties[0].discrete if len(action_value.properties) > 0 else 0, dtype=self._dtype
)
torch.tensor(player_observation_space.deserialize(actor_sample.observation).value, dtype=self._dtype)
)

action.append(torch.tensor(player_action_space.deserialize(actor_sample.action).value, dtype=self._dtype))
reward.append(
torch.tensor(actor_sample.reward if actor_sample.reward is not None else 0, dtype=self._dtype)
)
@@ -227,14 +211,13 @@ async def impl(self, run_session):
model_id = f"{run_session.run_id}_model"

assert self._environment_specs.num_players == 1
assert len(self._environment_specs.action_space.properties) == 1
# assert self._environment_specs.action_space.properties[0].WhichOneof("type") == "discrete"
assert isinstance(self._environment_specs.get_action_space().gym_space, Discrete)

model = SimpleA2CModel(
model_id,
environment_implementation=self._environment_specs.implementation,
num_input=flattened_dimensions(self._environment_specs.observation_space),
num_output=flattened_dimensions(self._environment_specs.action_space),
num_input=utils.flatdim(self._environment_specs.get_observation_space().gym_space),
num_output=utils.flatdim(self._environment_specs.get_action_space().gym_space),
actor_network_num_hidden_nodes=self._cfg.actor_network.num_hidden_nodes,
critic_network_num_hidden_nodes=self._cfg.critic_network.num_hidden_nodes,
dtype=self._dtype,
@@ -285,7 +268,7 @@ async def impl(self, run_session):
run_id=run_session.run_id,
model_id=model_id,
model_version=version_info["version_number"],
environment_specs=self._environment_specs,
environment_specs=self._environment_specs.serialize(),
),
)
],
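
For the discrete branch in the simple_a2c.py hunks above, a standalone sketch of turning actor-network logits into a sampled discrete action with Categorical; the resulting .numpy() value is the kind of value handed to action_space.create(value=...). The logits here are random stand-ins, not the actual model output.

import torch

num_actions = 3  # hypothetical size of a Discrete action space
logits = torch.randn(num_actions)  # stand-in for model.actor_network(observation_tensor)
probs = torch.softmax(logits, dim=-1)
discrete_action_tensor = torch.distributions.Categorical(probs).sample()
action_value = discrete_action_tensor.numpy()  # 0-d array, passed to action_space.create(value=...)
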