REINFORCE

Figure: REINFORCE algorithm pseudocode [1]

toyrl.reinforce.default_config module-attribute

default_config = ReinforceConfig(env_name='CartPole-v1', render_mode=None, solved_threshold=475.0, num_episodes=100000, learning_rate=0.002, with_baseline=True, log_wandb=True)

toyrl.reinforce.trainer module-attribute

toyrl.reinforce.ReinforceConfig dataclass

ReinforceConfig(env_name: str = 'CartPole-v1', render_mode: str | None = None, solved_threshold: float = 475.0, gamma: float = 0.999, num_episodes: int = 500, learning_rate: float = 0.01, with_baseline: bool = True, log_wandb: bool = False)

Configuration for the REINFORCE algorithm.

env_name class-attribute instance-attribute

env_name: str = 'CartPole-v1'

render_mode class-attribute instance-attribute

render_mode: str | None = None

solved_threshold class-attribute instance-attribute

solved_threshold: float = 475.0

gamma class-attribute instance-attribute

gamma: float = 0.999

num_episodes class-attribute instance-attribute

num_episodes: int = 500

learning_rate class-attribute instance-attribute

learning_rate: float = 0.01

with_baseline class-attribute instance-attribute

with_baseline: bool = True

log_wandb class-attribute instance-attribute

log_wandb: bool = False
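
A quick sketch of constructing a config by overriding a few of these defaults; the field names come from the dataclass above, while the chosen values are arbitrary:

from dataclasses import asdict

from toyrl.reinforce import ReinforceConfig

config = ReinforceConfig(
    env_name="CartPole-v1",
    num_episodes=1000,        # arbitrary override of the default 500
    learning_rate=0.002,
    with_baseline=True,
    log_wandb=False,
)
print(asdict(config))         # plain dict of all fields, as passed to wandb.init by the trainer below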

toyrl.reinforce.PolicyNet

PolicyNet(env_dim: int, action_num: int)

Bases: Module

A simple policy network for REINFORCE.

Source code in toyrl/reinforce.py
def __init__(self, env_dim: int, action_num: int) -> None:
    super().__init__()
    layers = [
        nn.Linear(env_dim, 64),
        nn.ReLU(),
        nn.Linear(64, action_num),
    ]
    self.model = nn.Sequential(*layers)
    self.train()

model instance-attribute

model = Sequential(*layers)

forward

forward(x: Tensor) -> Tensor
Source code in toyrl/reinforce.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    return self.model(x)
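
A minimal usage sketch, assuming CartPole-v1's 4-dimensional observation and 2 discrete actions (these shapes are not part of the code above):

import torch

from toyrl.reinforce import PolicyNet

net = PolicyNet(env_dim=4, action_num=2)
obs = torch.zeros(4)                      # placeholder observation
logits = net(obs)                         # shape (2,): unnormalized action scores
probs = torch.softmax(logits, dim=-1)     # action probabilities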

toyrl.reinforce.Experience dataclass

Experience(observation: Any, action: Any, action_log_prob: Tensor, reward: float, next_observation: Any, terminated: bool, truncated: bool)

An experience for REINFORCE.

observation instance-attribute

observation: Any

action instance-attribute

action: Any

action_log_prob instance-attribute

action_log_prob: Tensor

reward instance-attribute

reward: float

next_observation instance-attribute

next_observation: Any

terminated instance-attribute

terminated: bool

truncated instance-attribute

truncated: bool
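
A hypothetical transition for illustration; in training, action_log_prob comes from Agent.act below and the observations come from the Gymnasium environment:

import numpy as np
import torch

from toyrl.reinforce import Experience

exp = Experience(
    observation=np.zeros(4, dtype=np.float32),        # placeholder CartPole observation
    action=1,
    action_log_prob=torch.tensor(-0.69),               # log-probability of the sampled action
    reward=1.0,
    next_observation=np.zeros(4, dtype=np.float32),
    terminated=False,
    truncated=False,
)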

toyrl.reinforce.ReplayBuffer dataclass

ReplayBuffer(buffer: list[Experience] = list())

A replay buffer for REINFORCE.

buffer class-attribute instance-attribute

buffer: list[Experience] = field(default_factory=list)

__len__

__len__() -> int
Source code in toyrl/reinforce.py
def __len__(self) -> int:
    return len(self.buffer)

add_experience

add_experience(experience: Experience) -> None
Source code in toyrl/reinforce.py
def add_experience(self, experience: Experience) -> None:
    self.buffer.append(experience)

reset

reset() -> None
Source code in toyrl/reinforce.py
def reset(self) -> None:
    self.buffer = []

sample

sample() -> list[Experience]
Source code in toyrl/reinforce.py
def sample(self) -> list[Experience]:
    return self.buffer

total_reward

total_reward() -> float
Source code in toyrl/reinforce.py
def total_reward(self) -> float:
    return sum(experience.reward for experience in self.buffer)
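
A short sketch of the buffer's lifecycle, reusing the hypothetical exp from the Experience example above; because REINFORCE is on-policy, the buffer is emptied after every policy update:

from toyrl.reinforce import ReplayBuffer

buffer = ReplayBuffer()
buffer.add_experience(exp)
print(len(buffer))             # 1
print(buffer.total_reward())   # 1.0
buffer.reset()                 # cleared after each update (see Agent.onpolicy_reset)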

toyrl.reinforce.Agent

Agent(policy_net: Module, optimizer: Optimizer)

An agent for REINFORCE.

Source code in toyrl/reinforce.py
def __init__(self, policy_net: nn.Module, optimizer: torch.optim.Optimizer) -> None:
    self._policy_net = policy_net
    self._optimizer = optimizer
    self._replay_buffer = ReplayBuffer()

onpolicy_reset

onpolicy_reset() -> None
Source code in toyrl/reinforce.py
def onpolicy_reset(self) -> None:
    self._replay_buffer.reset()

add_experience

add_experience(experience: Experience) -> None
Source code in toyrl/reinforce.py
def add_experience(self, experience: Experience) -> None:
    self._replay_buffer.add_experience(experience)

get_buffer_total_reward

get_buffer_total_reward() -> float
Source code in toyrl/reinforce.py
def get_buffer_total_reward(self) -> float:
    return self._replay_buffer.total_reward()

act

act(observation: floating) -> tuple[int, Tensor]
Source code in toyrl/reinforce.py
def act(self, observation: np.floating) -> tuple[int, torch.Tensor]:
    x = torch.from_numpy(observation.astype(np.float32))
    logits = self._policy_net(x)
    next_action_dist = torch.distributions.Categorical(logits=logits)
    action = next_action_dist.sample()
    action_log_prob = next_action_dist.log_prob(action)
    return action.item(), action_log_prob
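
A sketch of sampling one action, again assuming a 4-dimensional observation and 2 actions; the returned log-probability stays connected to the policy network's graph so it can be backpropagated through in policy_update:

import numpy as np
import torch

from toyrl.reinforce import Agent, PolicyNet

policy_net = PolicyNet(env_dim=4, action_num=2)
optimizer = torch.optim.Adam(policy_net.parameters(), lr=0.01)
agent = Agent(policy_net=policy_net, optimizer=optimizer)

observation = np.zeros(4, dtype=np.float32)       # placeholder observation
action, action_log_prob = agent.act(observation)  # int in {0, 1}, plus a differentiable tensor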

policy_update

policy_update(gamma: float, with_baseline: bool) -> float
Source code in toyrl/reinforce.py
def policy_update(self, gamma: float, with_baseline: bool) -> float:
    experiences = self._replay_buffer.sample()
    # returns
    T = len(experiences)
    returns = torch.zeros(T)
    future_ret = 0.0
    for t in reversed(range(T)):
        future_ret = experiences[t].reward + gamma * future_ret
        returns[t] = future_ret
    # baseline
    if with_baseline:
        returns -= returns.mean()

    # log_probs
    action_log_probs = [exp.action_log_prob for exp in experiences]
    log_probs = torch.stack(action_log_probs)
    # loss
    loss = -log_probs * returns
    loss = torch.sum(loss)
    # update
    self._optimizer.zero_grad()
    loss.backward()
    self._optimizer.step()
    return loss.item()
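
The loop above computes the Monte Carlo returns backwards, G_t = r_t + gamma * G_{t+1}, and the loss is -sum_t log pi(a_t | s_t) * G_t, with the episode-mean return subtracted when with_baseline is set. A standalone numerical check of the return recursion, assuming three unit rewards and gamma = 0.9:

import torch

rewards, gamma = [1.0, 1.0, 1.0], 0.9
T = len(rewards)
returns = torch.zeros(T)
future_ret = 0.0
for t in reversed(range(T)):               # same backward recursion as policy_update
    future_ret = rewards[t] + gamma * future_ret
    returns[t] = future_ret
print(returns)                             # tensor([2.7100, 1.9000, 1.0000])
print(returns - returns.mean())            # centered returns used when with_baseline=True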

toyrl.reinforce.ReinforceTrainer

ReinforceTrainer(config: ReinforceConfig)

A trainer for REINFORCE.

Source code in toyrl/reinforce.py
def __init__(self, config: ReinforceConfig) -> None:
    self.config = config
    self.env = gym.make(config.env_name, render_mode=config.render_mode)
    env_dim = self.env.observation_space.shape[0]  # type: ignore[index]
    action_num = self.env.action_space.n  # type: ignore[attr-defined]
    policy_net = PolicyNet(env_dim=env_dim, action_num=action_num)
    optimizer = optim.Adam(policy_net.parameters(), lr=config.learning_rate)
    self.agent = Agent(policy_net=policy_net, optimizer=optimizer)

    self.num_episodes = config.num_episodes
    self.gamma = config.gamma
    self.with_baseline = config.with_baseline
    self.solved_threshold = config.solved_threshold
    if config.log_wandb:
        wandb.init(
            # set the wandb project where this run will be logged
            project="Reinforce",
            name=f"[{config.env_name}]lr={config.learning_rate}, baseline={config.with_baseline}",
            # track hyperparameters and run metadata
            config=asdict(config),
        )

config instance-attribute

config = config

env instance-attribute

env = make(env_name, render_mode=render_mode)

agent instance-attribute

agent = Agent(policy_net=policy_net, optimizer=optimizer)

num_episodes instance-attribute

num_episodes = num_episodes

gamma instance-attribute

gamma = gamma

with_baseline instance-attribute

with_baseline = with_baseline

solved_threshold instance-attribute

solved_threshold = solved_threshold

train

train() -> None

Train the agent.

Source code in toyrl/reinforce.py
def train(self) -> None:
    """Train the agent."""
    for epi in range(self.num_episodes):
        observation, _ = self.env.reset()
        terminated, truncated = False, False
        while not (terminated or truncated):
            action, action_log_prob = self.agent.act(observation)
            next_observation, reward, terminated, truncated, _ = self.env.step(action)
            experience = Experience(
                observation=observation,
                action=action,
                action_log_prob=action_log_prob,
                reward=float(reward),
                terminated=terminated,
                truncated=truncated,
                next_observation=next_observation,
            )
            self.agent.add_experience(experience)
            observation = next_observation
            if self.config.render_mode is not None:
                self.env.render()
        loss = self.agent.policy_update(
            gamma=self.gamma,
            with_baseline=self.with_baseline,
        )
        total_reward = self.agent.get_buffer_total_reward()
        solved = total_reward > self.solved_threshold
        self.agent.onpolicy_reset()
        print(f"Episode {epi}, loss: {loss}, total_reward: {total_reward}, solved: {solved}")
        if self.config.log_wandb:
            wandb.log(
                {
                    "episode": epi,
                    "loss": loss,
                    "total_reward": total_reward,
                }
            )
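
Putting it together, a minimal local run with the dataclass defaults and wandb logging disabled (the exact field values are arbitrary):

from toyrl.reinforce import ReinforceConfig, ReinforceTrainer

config = ReinforceConfig(num_episodes=500, log_wandb=False)
trainer = ReinforceTrainer(config)
trainer.train()   # prints per-episode loss, total reward, and the solved flag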

1. L. Graesser and W. L. Keng, Foundations of Deep Reinforcement Learning: Theory and Practice in Python. Addison-Wesley Professional, 2019.