A2C

Figure: A2C algorithm pseudocode [1]

toyrl.a2c.default_config module-attribute

default_config = A2CConfig(env_name='CartPole-v1', render_mode=None, solved_threshold=475.0, num_episodes=100000, learning_rate=0.002, log_wandb=True)
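
A quick way to reuse these defaults is to copy them with dataclasses.replace and override selected fields. This is an illustrative sketch, assuming the package is importable as toyrl; the overridden values are not tuned recommendations.

from dataclasses import replace

from toyrl.a2c import A2CTrainer, default_config

# Copy the shipped defaults, but disable wandb logging and shorten the run (illustrative values).
config = replace(default_config, log_wandb=False, num_episodes=500)
trainer = A2CTrainer(config)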

toyrl.a2c.trainer module-attribute

toyrl.a2c.A2CConfig dataclass

A2CConfig(env_name: str = 'CartPole-v1', render_mode: str | None = None, solved_threshold: float = 475.0, gamma: float = 0.999, lambda_: float = 0.98, value_loss_coef: float = 0.5, policy_loss_coef: float = 0.5, entropy_coef: float = 0.01, num_episodes: int = 500, learning_rate: float = 0.01, eval_episodes: int = 10, eval_interval: int = 100, log_wandb: bool = False)

Configuration for the A2C algorithm.

env_name class-attribute instance-attribute

env_name: str = 'CartPole-v1'

render_mode class-attribute instance-attribute

render_mode: str | None = None

solved_threshold class-attribute instance-attribute

solved_threshold: float = 475.0

gamma class-attribute instance-attribute

gamma: float = 0.999

lambda_ class-attribute instance-attribute

lambda_: float = 0.98

value_loss_coef class-attribute instance-attribute

value_loss_coef: float = 0.5

policy_loss_coef class-attribute instance-attribute

policy_loss_coef: float = 0.5

entropy_coef class-attribute instance-attribute

entropy_coef: float = 0.01

num_episodes class-attribute instance-attribute

num_episodes: int = 500

learning_rate class-attribute instance-attribute

learning_rate: float = 0.01

eval_episodes class-attribute instance-attribute

eval_episodes: int = 10

eval_interval class-attribute instance-attribute

eval_interval: int = 100

log_wandb class-attribute instance-attribute

log_wandb: bool = False
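
A minimal sketch of building a configuration by hand; the hyperparameter values below are illustrative, not recommended settings.

from toyrl.a2c import A2CConfig

config = A2CConfig(
    env_name="CartPole-v1",
    gamma=0.99,          # discount factor (illustrative)
    lambda_=0.95,        # GAE lambda (illustrative)
    num_episodes=1000,
    learning_rate=1e-3,
    log_wandb=False,
)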

toyrl.a2c.ActorCriticNet

ActorCriticNet(env_dim: int, action_num: int)

Bases: Module

Source code in toyrl/a2c.py
def __init__(self, env_dim: int, action_num: int) -> None:
    super().__init__()
    self.env_dim = env_dim
    self.action_num = action_num
    self.shared_layers = nn.Sequential(
        nn.Linear(env_dim, 64),
        nn.ReLU(),
    )
    self.policy_head = nn.Linear(64, action_num)
    self.value_head = nn.Linear(64, 1)

env_dim instance-attribute

env_dim = env_dim

action_num instance-attribute

action_num = action_num

shared_layers instance-attribute

shared_layers = Sequential(Linear(env_dim, 64), ReLU())

policy_head instance-attribute

policy_head = Linear(64, action_num)

value_head instance-attribute

value_head = Linear(64, 1)

forward

forward(x: Tensor) -> tuple[Tensor, Tensor]
Source code in toyrl/a2c.py
def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    x = self.shared_layers(x)
    policy_action_logits = self.policy_head(x)
    v_value = self.value_head(x)
    return policy_action_logits, v_value
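
The forward pass returns the policy logits and the state-value estimate for an observation. A small sketch, assuming CartPole-v1 dimensions (4 observation features, 2 actions):

import torch

from toyrl.a2c import ActorCriticNet

net = ActorCriticNet(env_dim=4, action_num=2)
obs = torch.randn(4)               # a single observation
logits, value = net(obs)           # policy logits and V(s)
print(logits.shape, value.shape)   # torch.Size([2]) torch.Size([1])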

toyrl.a2c.Experience dataclass

Experience(observation: Any, action: Any, reward: float, next_observation: Any, terminated: bool, truncated: bool)

observation instance-attribute

observation: Any

action instance-attribute

action: Any

reward instance-attribute

reward: float

next_observation instance-attribute

next_observation: Any

terminated instance-attribute

terminated: bool

truncated instance-attribute

truncated: bool
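
One transition is stored per environment step. An illustrative instance, using a CartPole-sized observation:

import numpy as np

from toyrl.a2c import Experience

exp = Experience(
    observation=np.zeros(4, dtype=np.float32),
    action=1,
    reward=1.0,
    next_observation=np.zeros(4, dtype=np.float32),
    terminated=False,
    truncated=False,
)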

toyrl.a2c.ReplayBuffer dataclass

ReplayBuffer(buffer: list[Experience] = list())

buffer class-attribute instance-attribute

buffer: list[Experience] = field(default_factory=list)

__len__

__len__() -> int
Source code in toyrl/a2c.py
def __len__(self) -> int:
    return len(self.buffer)

add_experience

add_experience(experience: Experience) -> None
Source code in toyrl/a2c.py
def add_experience(self, experience: Experience) -> None:
    self.buffer.append(experience)

reset

reset() -> None
Source code in toyrl/a2c.py
def reset(self) -> None:
    self.buffer = []

sample

sample() -> list[Experience]
Source code in toyrl/a2c.py
def sample(self) -> list[Experience]:
    return self.buffer

total_reward

total_reward() -> float
Source code in toyrl/a2c.py
def total_reward(self) -> float:
    return sum(experience.reward for experience in self.buffer)
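
A short usage sketch: add a transition, inspect the buffer, then clear it after the on-policy update.

import numpy as np

from toyrl.a2c import Experience, ReplayBuffer

buffer = ReplayBuffer()
buffer.add_experience(
    Experience(
        observation=np.zeros(4, dtype=np.float32),
        action=0,
        reward=1.0,
        next_observation=np.ones(4, dtype=np.float32),
        terminated=False,
        truncated=False,
    )
)
print(len(buffer), buffer.total_reward())  # 1 1.0
buffer.reset()  # discard the batch once it has been consumed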

toyrl.a2c.Agent

Agent(net: Module, optimizer: Optimizer)
Source code in toyrl/a2c.py
def __init__(self, net: nn.Module, optimizer: torch.optim.Optimizer) -> None:
    self._net = net
    self._optimizer = optimizer
    self._replay_buffer = ReplayBuffer()

onpolicy_reset

onpolicy_reset() -> None
Source code in toyrl/a2c.py
def onpolicy_reset(self) -> None:
    self._replay_buffer.reset()

add_experience

add_experience(experience: Experience) -> None
Source code in toyrl/a2c.py
def add_experience(self, experience: Experience) -> None:
    self._replay_buffer.add_experience(experience)

get_buffer_total_reward

get_buffer_total_reward() -> float
Source code in toyrl/a2c.py
def get_buffer_total_reward(self) -> float:
    return self._replay_buffer.total_reward()

act

act(observation: ndarray, eval: bool = False) -> int
Source code in toyrl/a2c.py
def act(self, observation: np.ndarray, eval: bool = False) -> int:
    x = torch.from_numpy(observation.astype(np.float32))
    with torch.no_grad():
        action_logits, _ = self._net(x)
    next_action_dist = torch.distributions.Categorical(logits=action_logits)
    action = next_action_dist.sample()
    if eval:
        action = next_action_dist.probs.argmax(dim=-1)
    return action.item()
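
act samples from the categorical policy during training and takes the argmax action when eval=True. A sketch with an untrained CartPole-sized network:

import numpy as np
import torch

from toyrl.a2c import ActorCriticNet, Agent

net = ActorCriticNet(env_dim=4, action_num=2)
agent = Agent(net=net, optimizer=torch.optim.Adam(net.parameters(), lr=1e-3))

obs = np.zeros(4, dtype=np.float32)
sampled = agent.act(obs)            # stochastic: sampled from the policy distribution
greedy = agent.act(obs, eval=True)  # deterministic: argmax of the action probabilities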

net_update

net_update(gamma: float, lambda_: float, value_loss_coef: float, policy_loss_coef: float, entropy_coef: float) -> tuple[float, float, float]
Source code in toyrl/a2c.py
def net_update(
    self, gamma: float, lambda_: float, value_loss_coef: float, policy_loss_coef: float, entropy_coef: float
) -> tuple[float, float, float]:
    experiences = self._replay_buffer.sample()

    observations = torch.tensor([exp.observation for exp in experiences])
    actions = torch.tensor([exp.action for exp in experiences])
    rewards = torch.tensor([exp.reward for exp in experiences]).unsqueeze(1)
    next_observations = torch.tensor([exp.next_observation for exp in experiences])
    terminateds = torch.tensor([exp.terminated for exp in experiences], dtype=torch.float32).unsqueeze(1)

    # calculate predicted V-values
    policy_action_logits, v_values = self._net(observations)
    # n-step return
    v_targets = torch.zeros_like(rewards)
    for t in reversed(range(len(experiences) - 1)):
        v_targets[t] = rewards[t] + gamma * v_targets[t + 1] * (1 - terminateds[t])
    # calculate value loss
    value_loss = nn.functional.mse_loss(v_values, v_targets)

    # calculate advantages by GAE
    with torch.no_grad():
        _, v_values_next = self._net(next_observations)
    deltas = rewards + gamma * v_values_next * (1 - terminateds) - v_values
    advantages = deltas.clone()
    for t in reversed(range(len(experiences) - 1)):
        advantages[t] = deltas[t] + gamma * lambda_ * advantages[t + 1] * (1 - terminateds[t])
    advantages = advantages / (advantages.std() + 1e-8)
    advantages = advantages.detach()

    action_dist = torch.distributions.Categorical(logits=policy_action_logits)
    action_entropy = action_dist.entropy().mean()
    action_log_probs = action_dist.log_prob(actions)
    # calculate policy loss
    policy_loss = -action_log_probs * advantages
    policy_loss = torch.mean(policy_loss)

    loss = value_loss * value_loss_coef + policy_loss * policy_loss_coef - entropy_coef * action_entropy

    # update
    self._optimizer.zero_grad()
    loss.backward()
    self._optimizer.step()
    return loss.item(), action_entropy.item(), advantages.mean().item()
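
The backward recursion for the GAE advantages can be checked on a toy batch. The standalone sketch below mirrors the delta and advantage loops in net_update with made-up numbers; it is not part of the library.

import torch

gamma, lambda_ = 0.999, 0.98

# A made-up 3-step rollout; shapes match net_update (one column per scalar).
rewards = torch.tensor([[1.0], [1.0], [1.0]])
v_values = torch.tensor([[0.5], [0.6], [0.7]])
v_values_next = torch.tensor([[0.6], [0.7], [0.0]])
terminateds = torch.tensor([[0.0], [0.0], [1.0]])

# TD errors, then the backward GAE recursion.
deltas = rewards + gamma * v_values_next * (1 - terminateds) - v_values
advantages = deltas.clone()
for t in reversed(range(len(rewards) - 1)):
    advantages[t] = deltas[t] + gamma * lambda_ * advantages[t + 1] * (1 - terminateds[t])
print(advantages.squeeze(1))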

toyrl.a2c.A2CTrainer

A2CTrainer(config: A2CConfig)
Source code in toyrl/a2c.py
def __init__(self, config: A2CConfig) -> None:
    self.config = config
    self.env = gym.make(config.env_name, render_mode=config.render_mode)
    env_dim = self.env.observation_space.shape[0]  # type: ignore[index]
    action_num = self.env.action_space.n  # type: ignore[attr-defined]
    net = ActorCriticNet(env_dim=env_dim, action_num=action_num)
    optimizer = optim.Adam(net.parameters(), lr=config.learning_rate)
    self.agent = Agent(net=net, optimizer=optimizer)

    self.num_episodes = config.num_episodes
    self.gamma = config.gamma
    self.lambda_ = config.lambda_
    self.value_loss_coef = config.value_loss_coef
    self.policy_loss_coef = config.policy_loss_coef
    self.entropy_coef = config.entropy_coef
    self.solved_threshold = config.solved_threshold
    if config.log_wandb:
        wandb.init(
            # set the wandb project where this run will be logged
            project="A2C",
            name=f"[{config.env_name}]lr={config.learning_rate}",
            # track hyperparameters and run metadata
            config=asdict(config),
        )
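
Constructing the trainer creates the gym environment, the actor-critic network, the Adam optimizer, and the agent, and copies the hyperparameters onto the trainer. A brief sketch:

from toyrl.a2c import A2CConfig, A2CTrainer

trainer = A2CTrainer(A2CConfig(log_wandb=False))
print(trainer.env.spec.id)                                   # CartPole-v1
print(trainer.num_episodes, trainer.gamma, trainer.lambda_)  # 500 0.999 0.98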

config instance-attribute

config = config

env instance-attribute

env = make(env_name, render_mode=render_mode)

agent instance-attribute

agent = Agent(net=net, optimizer=optimizer)

num_episodes instance-attribute

num_episodes = num_episodes

gamma instance-attribute

gamma = gamma

lambda_ instance-attribute

lambda_ = lambda_

value_loss_coef instance-attribute

value_loss_coef = value_loss_coef

policy_loss_coef instance-attribute

policy_loss_coef = policy_loss_coef

entropy_coef instance-attribute

entropy_coef = entropy_coef

solved_threshold instance-attribute

solved_threshold = solved_threshold

train

train() -> None
Source code in toyrl/a2c.py
def train(self) -> None:
    for i, episode in enumerate(range(self.num_episodes)):
        observation, _ = self.env.reset()
        terminated, truncated = False, False
        while not (terminated or truncated):
            action = self.agent.act(observation)
            next_observation, reward, terminated, truncated, _ = self.env.step(action)
            experience = Experience(
                observation=observation,
                action=action,
                reward=float(reward),
                terminated=terminated,
                truncated=truncated,
                next_observation=next_observation,
            )
            self.agent.add_experience(experience)
            observation = next_observation
            if self.config.render_mode is not None:
                self.env.render()
        loss, action_entropy, advantages_mean = self.agent.net_update(
            gamma=self.gamma,
            lambda_=self.lambda_,
            value_loss_coef=self.value_loss_coef,
            policy_loss_coef=self.policy_loss_coef,
            entropy_coef=self.entropy_coef,
        )
        total_reward = self.agent.get_buffer_total_reward()
        solved = total_reward > self.solved_threshold
        self.agent.onpolicy_reset()
        print(
            f"Episode {episode}, total_reward: {total_reward}, solved: {solved}, loss: {loss}, "
            f"action_entropy: {action_entropy}, advantages_mean: {advantages_mean}"
        )
        if self.config.log_wandb:
            wandb.log(
                {
                    "episode": episode,
                    "loss": loss,
                    "total_reward": total_reward,
                    "action_entropy": action_entropy,
                    "advantages_mean": advantages_mean,
                }
            )

        if i % self.config.eval_interval == 0:
            eval_reward = self.evaluate(self.config.eval_episodes)
            print(f"Episode {episode}, Eval reward: {eval_reward}")
            if self.config.log_wandb:
                wandb.log({"eval_reward": eval_reward, "episode": episode})

evaluate

evaluate(num_episodes: int) -> float
Source code in toyrl/a2c.py
def evaluate(self, num_episodes: int) -> float:
    total_reward = 0.0
    for _ in range(num_episodes):
        observation, _ = self.env.reset()
        terminated, truncated = False, False
        while not (terminated or truncated):
            action = self.agent.act(observation, eval=True)
            next_observation, reward, terminated, truncated, _ = self.env.step(action)
            observation = next_observation
            total_reward += float(reward)
    return total_reward / num_episodes
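
evaluate runs the greedy policy (eval=True) and returns the mean episode reward. A short sketch:

from toyrl.a2c import A2CConfig, A2CTrainer

trainer = A2CTrainer(A2CConfig(log_wandb=False))
mean_reward = trainer.evaluate(num_episodes=10)  # average reward over 10 greedy episodes
print(mean_reward)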

[1] L. Graesser and W. L. Keng, Foundations of Deep Reinforcement Learning: Theory and Practice in Python. Addison-Wesley Professional, 2019.