In [1]:
import inspect

In [2]:
from flatland.env_generation.env_generator import env_generator
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.ml.observations.flatten_tree_observation_for_rail_env import FlattenedNormalizedTreeObsForRailEnv
from flatland.ml.pettingzoo.wrappers import PettingzooFlatland
from flatland.ml.pettingzoo.examples.flatland_pettingzoo_stable_baselines import train_flatland_pettingzoo_supersuit, eval_flatland_pettingzoo

# PettingZoo

PettingZoo (https://www.pettingzoo.ml/) is a collection of multi-agent environments for reinforcement learning. We build a PettingZoo interface for Flatland.

### Background

PettingZoo is a popular multi-agent environment library (https://arxiv.org/abs/2009.14471) that aims to be the Gym standard for multi-agent reinforcement learning. The following advantages make it suitable for use with Flatland:

- Works with both RLlib (https://docs.ray.io/en/latest/rllib.html) and Stable-Baselines3 (https://stable-baselines3.readthedocs.io/) using wrappers from SuperSuit.
- Clean API (https://www.pettingzoo.ml/api) with additional facilities for parallel environments, saving observations, recording with the Gym monitor, and processing and normalising observations.
- Scikit-learn-inspired API, e.g.:

```python
act = model.predict(obs, deterministic=True)[0] 
```

- Parallel learning with Stable-Baselines3 in just two lines of code:

```python
env = ss.pettingzoo_env_to_vec_env_v0(env)
env = ss.concat_vec_envs_v0(env, 8, num_cpus=4, base_class='stable_baselines3')
```

- Tested with, and supports, various multi-agent environments with many agents, comparable to Flatland, e.g. https://www.pettingzoo.ml/magent
- The clean interface means we can add an experiment-tracking tool such as wandb and have full flexibility to save the information we want.



## PettingZoo Demo

Uses Stable-Baselines3 to train agents to play the Flatland environment using SuperSuit vector envs.

For more information, see https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html

Based on https://github.com/Farama-Foundation/PettingZoo/blob/master/tutorials/SB3/waterworld/sb3_waterworld_vector.py

#### Inspect Training Code

In [3]:
# Show the syntax-highlighted source of the training helper, obtained as a string via inspect.getsource.
%pycat inspect.getsource(train_flatland_pettingzoo_supersuit)

[0;32mdef[0m [0mtrain_flatland_pettingzoo_supersuit[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0menv_fn[0m[0;34m,[0m [0msteps[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m10_000[0m[0;34m,[0m [0mseed[0m[0;34m:[0m [0mint[0m [0;34m|[0m [0;32mNone[0m [0;34m=[0m [0;36m0[0m[0;34m,[0m [0;34m**[0m[0menv_kwargs[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;31m# Train a single model to play as each agent in a cooperative Parallel environment[0m[0;34m[0m
[0;34m[0m    [0menv[0m [0;34m=[0m [0menv_fn[0m[0;34m.[0m[0mparallel_env[0m[0;34m([0m[0;34m**[0m[0menv_kwargs[0m[0;34m)[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m    [0menv[0m[0;34m.[0m[0mreset[0m[0;34m([0m[0mseed[0m[0;34m=[0m[0mseed[0m[0;34m)[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m    [0mprint[0m[0;34m([0m[0;34mf"[0m[0;34mStarting training on [0m[0;34m{[0m[0mstr[0m[0;34m([0m[0menv[0m[0;34m.[0m[0mmetadata[0m

#### Inspect Eval Code


In [4]:
# Show the syntax-highlighted source of the evaluation helper, obtained as a string via inspect.getsource.
%pycat inspect.getsource(eval_flatland_pettingzoo)

[0;32mdef[0m [0meval_flatland_pettingzoo[0m[0;34m([0m[0menv_fn[0m[0;34m,[0m [0mnum_games[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m100[0m[0;34m,[0m [0mrender_mode[0m[0;34m:[0m [0mstr[0m [0;34m|[0m [0;32mNone[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m [0;34m**[0m[0menv_kwargs[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;31m# Evaluate a trained agent vs a random agent[0m[0;34m[0m
[0;34m[0m    [0menv[0m[0;34m:[0m [0mParallelEnv[0m [0;34m=[0m [0menv_fn[0m[0;34m.[0m[0mparallel_env[0m[0;34m([0m[0mrender_mode[0m[0;34m=[0m[0mrender_mode[0m[0;34m,[0m [0;34m**[0m[0menv_kwargs[0m[0;34m)[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m    [0mprint[0m[0;34m([0m[0;34m[0m
[0;34m[0m        [0;34mf"[0m[0;34m\nStarting evaluation on [0m[0;34m{[0m[0mstr[0m[0;34m([0m[0menv[0m[0;34m.[0m[0mmetadata[0m[0;34m[[0m[0;34m'name'[0m[0;34m][0m[0;34m)[0m[0;34m}[0m[0;34m (num_games=[0m[0;34m{[0m[0m

#### Train a model

In [5]:
# Build the observation builder step by step: a tree observation of depth 3
# that is handed a shortest-path predictor with max_depth=50.
shortest_path_predictor = ShortestPathPredictorForRailEnv(max_depth=50)
tree_obs_builder = FlattenedNormalizedTreeObsForRailEnv(
    max_depth=3,
    predictor=shortest_path_predictor,
)

# env_generator's result is unpacked as a 3-tuple; only the first element
# (the environment) is used in this notebook.
raw_env, _, _ = env_generator(obs_builder_object=tree_obs_builder)

# Wrap the environment; the helpers below call env_fn.parallel_env(**env_kwargs).
env_fn = PettingzooFlatland(raw_env)

# No extra keyword arguments are passed on to parallel_env().
env_kwargs = dict()



In [6]:
# Train a single model that plays as every agent in the parallel environment.
train_flatland_pettingzoo_supersuit(
    env_fn,
    steps=196_608,
    seed=0,
    **env_kwargs,
)

Starting training on flatland_pettingzoo.
Using cpu device




-------------------------------
| time/              |        |
|    fps             | 1467   |
|    iterations      | 1      |
|    time_elapsed    | 78     |
|    total_timesteps | 114688 |
-------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1223        |
|    iterations           | 2           |
|    time_elapsed         | 187         |
|    total_timesteps      | 229376      |
| train/                  |             |
|    approx_kl            | 0.008171785 |
|    clip_fraction        | 0.0342      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.6        |
|    explained_variance   | -0.00243    |
|    learning_rate        | 0.001       |
|    loss                 | 447         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.000753   |
|    value_loss           | 957         |
-----------------------------------------
Model has been saved

#### Evaluate 10 games (average reward can vary significantly between runs; it is negative in the run shown below)

In [8]:
# Evaluate over 10 games with no render mode requested; the call is the last
# expression of the cell, so its return value is displayed as the cell output.
eval_flatland_pettingzoo(
    env_fn,
    num_games=10,
    render_mode=None,
    **env_kwargs,
)


Starting evaluation on flatland_pettingzoo (num_games=10, render_mode=None)




Rewards:  {0: -1306, 1: -683, 2: -1485, 3: -955, 4: -1498, 5: -782, 6: -1194}
Avg reward: -1129.0


-1129.0

#### Watch 2 games

In [9]:
# Play 2 games with render_mode="human" so they can be watched; the call's
# return value is displayed as the cell output.
eval_flatland_pettingzoo(
    env_fn,
    num_games=2,
    render_mode="human",
    **env_kwargs,
)


Starting evaluation on flatland_pettingzoo (num_games=2, render_mode=human)
Rewards:  {0: -213, 1: -76, 2: -517, 3: -143, 4: -584, 5: -139, 6: -94}
Avg reward: -252.28571428571428


-252.28571428571428