PPO

Figure: PPO algorithm pseudocode [1]

toyrl.ppo.default_config module-attribute

default_config = PPOConfig(env_name='CartPole-v1', render_mode=None, solved_threshold=475.0, gamma=0.99, lambda_=0.95, epsilon=0.2, entropy_coef=0.01, total_timesteps=1000000, time_horizons=256, update_epochs=4, num_minibatches=4, learning_rate=0.00025, log_wandb=True)
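
default_config is a ready-made PPOConfig for CartPole-v1. Since PPOConfig is a dataclass, individual fields can be overridden with dataclasses.replace instead of retyping the whole configuration; a brief sketch:

from dataclasses import replace

from toyrl.ppo import default_config

quiet_config = replace(default_config, log_wandb=False)  # same hyperparameters, no W&B logging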

toyrl.ppo.trainer module-attribute

toyrl.ppo.PPOConfig dataclass

PPOConfig(env_name: str = 'CartPole-v1', num_envs: int = 4, render_mode: str | None = None, solved_threshold: float = 475.0, gamma: float = 0.999, lambda_: float = 0.98, epsilon: float = 0.2, entropy_coef: float = 0.01, total_timesteps: int = 500000, time_horizons: int = 128, update_epochs: int = 4, num_minibatches: int = 4, learning_rate: float = 0.00025, anneal_learning_rate: bool = True, log_wandb: bool = False)

Configuration for the PPO algorithm.

env_name class-attribute instance-attribute

env_name: str = 'CartPole-v1'

num_envs class-attribute instance-attribute

num_envs: int = 4

The number of parallel game environments.

render_mode class-attribute instance-attribute

render_mode: str | None = None

solved_threshold class-attribute instance-attribute

solved_threshold: float = 475.0

gamma class-attribute instance-attribute

gamma: float = 0.999

lambda_ class-attribute instance-attribute

lambda_: float = 0.98

epsilon class-attribute instance-attribute

epsilon: float = 0.2

entropy_coef class-attribute instance-attribute

entropy_coef: float = 0.01

total_timesteps class-attribute instance-attribute

total_timesteps: int = 500000

time_horizons class-attribute instance-attribute

time_horizons: int = 128

The number of time steps to collect from each parallel environment before updating the policy.

update_epochs class-attribute instance-attribute

update_epochs: int = 4

The number of epochs (K) of policy updates performed on each collected batch.

num_minibatches class-attribute instance-attribute

num_minibatches: int = 4

The number of mini-batches the collected batch is split into within each update epoch.

learning_rate class-attribute instance-attribute

learning_rate: float = 0.00025

anneal_learning_rate class-attribute instance-attribute

anneal_learning_rate: bool = True

log_wandb class-attribute instance-attribute

log_wandb: bool = False
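
Every field has a default, so a configuration can also be built from scratch by overriding only what differs. A minimal sketch (the override values below are illustrative, not recommendations):

from toyrl.ppo import PPOConfig

config = PPOConfig(
    env_name="CartPole-v1",
    total_timesteps=200_000,  # shorter run than the 500_000 default
    log_wandb=False,          # keep Weights & Biases logging off
)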

toyrl.ppo.ActorPolicyNet

ActorPolicyNet(env_dim: int, action_num: int)

Bases: Module

Source code in toyrl/ppo.py
def __init__(self, env_dim: int, action_num: int) -> None:
    super().__init__()
    layers = [
        nn.Linear(env_dim, 64),
        nn.Tanh(),
        nn.Linear(64, 64),
        nn.Tanh(),
        nn.Linear(64, action_num),
    ]
    self.model = nn.Sequential(*layers)

model instance-attribute

model = Sequential(*layers)

forward

forward(x: Tensor) -> Tensor
Source code in toyrl/ppo.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    return self.model(x)
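
The actor maps an observation vector to unnormalized action logits; action probabilities are obtained with a softmax, as done in PPOAgent.act. A minimal usage sketch, assuming toyrl is installed and using CartPole-v1 dimensions (4 observation features, 2 actions):

import torch
from toyrl.ppo import ActorPolicyNet

actor = ActorPolicyNet(env_dim=4, action_num=2)
obs = torch.zeros(4)                                       # dummy CartPole-v1 observation
logits = actor(obs)                                        # shape (2,), unnormalized logits
probs = torch.softmax(logits, dim=-1)                      # action probabilities
action = torch.distributions.Categorical(probs).sample()   # sampled action index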

toyrl.ppo.CriticValueNet

CriticValueNet(env_dim: int)

Bases: Module

Source code in toyrl/ppo.py
def __init__(self, env_dim: int) -> None:
    super().__init__()
    layers = [
        nn.Linear(env_dim, 64),
        nn.Tanh(),
        nn.Linear(64, 64),
        nn.Tanh(),
        nn.Linear(64, 1),
    ]
    self.model = nn.Sequential(*layers)

model instance-attribute

model = Sequential(*layers)

forward

forward(x: Tensor) -> Tensor
Source code in toyrl/ppo.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    return self.model(x)
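
The critic maps the same observation vector to a single state-value estimate, used for the value loss and for advantage estimation. A brief sketch with CartPole-v1 dimensions:

import torch
from toyrl.ppo import CriticValueNet

critic = CriticValueNet(env_dim=4)
obs = torch.zeros(4)
value = critic(obs)  # shape (1,): estimated state value V(s)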

toyrl.ppo.Experience dataclass

Experience(env_id: int, terminated: bool, truncated: bool, observation: Any, reward: float, next_observation: Any, action: Any, action_logprob: float, advantage: float | None = None, target_value: float | None = None)

env_id instance-attribute

env_id: int

terminated instance-attribute

terminated: bool

truncated instance-attribute

truncated: bool

observation instance-attribute

observation: Any

reward instance-attribute

reward: float

next_observation instance-attribute

next_observation: Any

action instance-attribute

action: Any

action_logprob instance-attribute

action_logprob: float

advantage class-attribute instance-attribute

advantage: float | None = None

target_value class-attribute instance-attribute

target_value: float | None = None

toyrl.ppo.ReplayBuffer dataclass

ReplayBuffer(buffer: list[Experience] = list(), env_ids: set[int] = set())

buffer class-attribute instance-attribute

buffer: list[Experience] = field(default_factory=list)

env_ids class-attribute instance-attribute

env_ids: set[int] = field(default_factory=set)

__len__

__len__() -> int
Source code in toyrl/ppo.py
def __len__(self) -> int:
    return len(self.buffer)

add_experience

add_experience(experience: Experience) -> None
Source code in toyrl/ppo.py
def add_experience(self, experience: Experience) -> None:
    self.buffer.append(experience)
    self.env_ids.add(experience.env_id)

reset

reset() -> None
Source code in toyrl/ppo.py
def reset(self) -> None:
    self.buffer = []
    self.env_ids = set()

sample

sample() -> list[Experience]
Source code in toyrl/ppo.py
def sample(self) -> list[Experience]:
    return self.buffer
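
The buffer simply accumulates on-policy transitions across all parallel environments; sample() returns everything collected so far, and reset() clears it after each policy update. An illustrative sketch with a dummy transition:

import numpy as np
from toyrl.ppo import Experience, ReplayBuffer

buffer = ReplayBuffer()
buffer.add_experience(
    Experience(
        env_id=0,
        terminated=False,
        truncated=False,
        observation=np.zeros(4, dtype=np.float32),
        reward=1.0,
        next_observation=np.zeros(4, dtype=np.float32),
        action=0,
        action_logprob=-0.69,
    )
)
batch = buffer.sample()  # the whole rollout (on-policy, no random sampling)
buffer.reset()           # discard the data once the policy has been updated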

toyrl.ppo.PPOAgent

PPOAgent(actor: ActorPolicyNet, critic: CriticValueNet, optimizer: Optimizer)
Source code in toyrl/ppo.py
def __init__(self, actor: ActorPolicyNet, critic: CriticValueNet, optimizer: optim.Optimizer) -> None:
    self.actor = actor
    self.critic = critic
    self.optimizer = optimizer
    self._replay_buffer = ReplayBuffer()

actor instance-attribute

actor = actor

critic instance-attribute

critic = critic

optimizer instance-attribute

optimizer = optimizer

add_experience

add_experience(experience: Experience) -> None
Source code in toyrl/ppo.py
def add_experience(self, experience: Experience) -> None:
    self._replay_buffer.add_experience(experience)

reset

reset() -> None
Source code in toyrl/ppo.py
def reset(self) -> None:
    self._replay_buffer.reset()

act

act(observation: Any) -> tuple[int, float]
Source code in toyrl/ppo.py
def act(self, observation: Any) -> tuple[int, float]:
    x = torch.from_numpy(observation.astype(np.float32))
    with torch.no_grad():
        logits = self.actor(x)
    probs = torch.nn.functional.softmax(logits, dim=-1)
    action = torch.distributions.Categorical(probs).sample()
    action_logprob = torch.log(probs[action])
    return action.item(), action_logprob.item()
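
act converts a NumPy observation to a tensor, samples from the categorical action distribution, and returns the action together with its log-probability (needed later for the PPO probability ratio). A small sketch with a dummy observation; the optimizer construction mirrors PPOTrainer.__init__ but is repeated here only for self-containedness:

import numpy as np
import torch
from toyrl.ppo import ActorPolicyNet, CriticValueNet, PPOAgent

actor = ActorPolicyNet(env_dim=4, action_num=2)
critic = CriticValueNet(env_dim=4)
optimizer = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=2.5e-4)
agent = PPOAgent(actor=actor, critic=critic, optimizer=optimizer)

action, logprob = agent.act(np.zeros(4, dtype=np.float32))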

net_update

net_update(num_minibatches: int, gamma: float, lambda_: float, epsilon: float, entropy_coef: float) -> float
Source code in toyrl/ppo.py
def net_update(
    self,
    num_minibatches: int,
    gamma: float,
    lambda_: float,
    epsilon: float,
    entropy_coef: float,
) -> float:
    raw_experiences = self._replay_buffer.sample()
    # calculate advantages and target values by GAE
    experiences = self._calc_adv_v_target(raw_experiences, gamma, lambda_)
    minibatch_size = len(experiences) // num_minibatches
    total_loss = 0.0
    for i in range(num_minibatches):
        batch_experiences = experiences[minibatch_size * i : minibatch_size * (i + 1)]
        observations = torch.tensor(np.array([exp.observation for exp in batch_experiences]), dtype=torch.float32)
        actions = torch.tensor(np.array([exp.action for exp in batch_experiences]), dtype=torch.int64)
        old_action_logprobs = torch.tensor(
            np.array([exp.action_logprob for exp in batch_experiences]), dtype=torch.float32
        )
        advantages = torch.tensor(np.array([exp.advantage for exp in batch_experiences]), dtype=torch.float32)
        target_v_values = torch.tensor(
            np.array([exp.target_value for exp in batch_experiences]), dtype=torch.float32
        )

        # critic value loss
        v_values = self.critic(observations).squeeze(1)
        critic_value_loss = torch.nn.functional.mse_loss(v_values, target_v_values)

        # actor policy loss
        action_logits = self.actor(observations)
        action_probs = torch.nn.functional.softmax(action_logits, dim=-1)
        action_entropy = torch.distributions.Categorical(action_probs).entropy()
        action_logprobs = torch.gather(action_probs.log(), 1, actions.unsqueeze(1)).squeeze(1)
        ratios = torch.exp(action_logprobs - old_action_logprobs)
        surr1 = ratios * advantages
        surr2 = torch.clamp(ratios, 1 - epsilon, 1 + epsilon) * advantages
        actor_policy_loss = -torch.min(surr1, surr2).mean() - entropy_coef * action_entropy.mean()

        loss = actor_policy_loss + critic_value_loss
        # update actor and critic
        self.optimizer.zero_grad()
        loss.backward()
        # clip
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5)
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
        self.optimizer.step()

        total_loss += loss.item()
    return total_loss / num_minibatches
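
net_update relies on the helper _calc_adv_v_target (not shown on this page) to attach advantages and value targets to each experience before the mini-batch loop. A minimal sketch of the Generalized Advantage Estimation recursion it is assumed to perform, written per environment over one rollout (names and the handling of done flags are illustrative):

def gae_sketch(
    rewards: list[float],
    values: list[float],       # V(s_t) from the critic
    next_values: list[float],  # V(s_{t+1}) from the critic
    dones: list[bool],         # terminated or truncated flags
    gamma: float,
    lambda_: float,
) -> tuple[list[float], list[float]]:
    advantages = [0.0] * len(rewards)
    last_adv = 0.0
    for t in reversed(range(len(rewards))):
        not_done = 1.0 - float(dones[t])
        # TD error: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[t] + gamma * next_values[t] * not_done - values[t]
        # GAE recursion: A_t = delta_t + gamma * lambda * A_{t+1}
        last_adv = delta + gamma * lambda_ * not_done * last_adv
        advantages[t] = last_adv
    # value targets used by the critic loss: A_t + V(s_t)
    target_values = [adv + v for adv, v in zip(advantages, values)]
    return advantages, target_values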

toyrl.ppo.PPOTrainer

PPOTrainer(config: PPOConfig)
Source code in toyrl/ppo.py
def __init__(self, config: PPOConfig) -> None:
    self.config = config
    self.envs = self._make_env()
    env_dim = self.envs.single_observation_space.shape[0]
    action_num = self.envs.single_action_space.n
    actor = ActorPolicyNet(env_dim=env_dim, action_num=action_num)
    critic = CriticValueNet(env_dim=env_dim)
    optimizer = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=config.learning_rate)
    self.agent = PPOAgent(actor=actor, critic=critic, optimizer=optimizer)
    if config.log_wandb:
        wandb.init(
            # set the wandb project where this run will be logged
            project="PPO",
            name=f"[{config.env_name}]lr={config.learning_rate}",
            # track hyperparameters and run metadata
            config=asdict(config),
        )

config instance-attribute

config = config

envs instance-attribute

envs = _make_env()

agent instance-attribute

agent = PPOAgent(actor=actor, critic=critic, optimizer=optimizer)

train

train()
Source code in toyrl/ppo.py
def train(self):
    batch_size = self.config.time_horizons * self.config.num_envs
    num_iteration = self.config.total_timesteps // batch_size

    global_step = 0
    observations, _ = self.envs.reset()
    for iteration in range(num_iteration):
        if self.config.anneal_learning_rate:
            frac = 1.0 - iteration / num_iteration
            lr = frac * self.config.learning_rate
            self.agent.optimizer.param_groups[0]["lr"] = lr

        # Collect experience
        for step in range(self.config.time_horizons):
            global_step += self.config.num_envs
            actions, action_logprobs = [], []
            for obs in observations:
                action, action_logprob = self.agent.act(obs)
                actions.append(action)
                action_logprobs.append(action_logprob)
            next_observations, rewards, terminateds, truncateds, infos = self.envs.step(np.array(actions))
            for env_id in range(self.config.num_envs):
                experience = Experience(
                    env_id=env_id,
                    terminated=terminateds[env_id],
                    truncated=truncateds[env_id],
                    observation=observations[env_id],
                    action=actions[env_id],
                    action_logprob=action_logprobs[env_id],
                    reward=float(rewards[env_id]),
                    next_observation=next_observations[env_id],
                )
                self.agent.add_experience(experience)
            observations = next_observations

            if "episode" in infos:
                for i in range(self.config.num_envs):
                    if infos["_episode"][i]:
                        print(f"global_step={global_step}, episodic_return={infos['episode']['r'][i]}")
                        if self.config.log_wandb:
                            wandb.log(
                                {
                                    "global_step": global_step,
                                    "episodic_return": infos["episode"]["r"][i],
                                }
                            )

        # Update policy
        total_loss = 0.0
        for _ in range(self.config.update_epochs):
            loss = self.agent.net_update(
                gamma=self.config.gamma,
                lambda_=self.config.lambda_,
                epsilon=self.config.epsilon,
                entropy_coef=self.config.entropy_coef,
                num_minibatches=self.config.num_minibatches,
            )
            total_loss += loss
        loss = total_loss / self.config.update_epochs
        if self.config.log_wandb:
            wandb.log(
                {
                    "global_step": global_step,
                    "learning_rate": self.agent.optimizer.param_groups[0]["lr"],
                    "loss": loss,
                }
            )
        # Onpolicy reset
        self.agent.reset()
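
Putting it together, a training run can be launched from a config; a short sketch assuming toyrl and its dependencies (gymnasium, torch) are installed:

from toyrl.ppo import PPOConfig, PPOTrainer

trainer = PPOTrainer(PPOConfig(log_wandb=False))
trainer.train()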

[1] L. Graesser and W. L. Keng, Foundations of Deep Reinforcement Learning: Theory and Practice in Python. Addison-Wesley Professional, 2019.