We compute the sum of discounted future rewards induced by a trajectory, and if this is positive, we increase so as to make this trajectory more likely, otherwise we decrease . Thus, we reinforce good behaviors and reduce the chances of generating bad ones.
Algorithm 9 REINFORCE (episodic version)
Initialize policy parameters θ
repeat
Sample episode τ=(s1,a1,r1,s2,…,sT) using πθ
for k=1,…,T do
Gk=∑l=kTγl−krl
θ←θ+ηγk−1Gk∇θlogπθ(ak∣sk)
end for
until converged
Implementation
class PolicyNetwork(nn.Module): def __init__(self, state_dim, action_dim, hidden_dim=128): super(PolicyNetwork, self).__init__() self.network = nn.Sequential( nn.Linear(state_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, action_dim), nn.Softmax(dim=-1) # Outputs a valid probability distribution ) def forward(self, state): state = torch.from_numpy(state).float().unsqueeze(0) return self.network(state)class REINFORCEAgent: def __init__(self, state_dim: int, num_actions: int, lr: float, gamma: float): self.policy = PolicyNetwork(state_dim, num_actions) self.optimizer = optim.Adam(self.policy.parameters(), lr=lr) self.gamma = gamma # Memory lists to record episode history self.log_probs = [] self.rewards = [] def sample_action(self, state) -> int: probs = self.policy(state) m = Categorical(probs) action = m.sample() self.log_probs.append(m.log_prob(action)) return action.item() def update_policy(self): discounted_rewards = [] cumulative_return = 0 # Compute returns G_t backwards from end of episode for r in reversed(self.rewards): cumulative_return = r + self.gamma * cumulative_return discounted_rewards.insert(0, cumulative_return) # Standardize returns returns = torch.tensor(discounted_rewards, dtype=torch.float32) if len(returns) > 1: returns = (returns - returns.mean()) / (returns.std() + 1e-8) # We multiply by -1 because REINFORCE is gradient ascent on J, but optimizers minimize losses. policy_loss = [] for log_prob, G_t in zip(self.log_probs, returns): policy_loss.append(-log_prob * G_t) self.optimizer.zero_grad() loss = torch.stack(policy_loss).sum() loss.backward() self.optimizer.step() self.log_probs = [] self.rewards = []def train(env, num_episodes: int): agent = REINFORCEAgent( env.state_dim, env.num_actions, lr=0.005, gamma=0.99, ) for episode in range(num_episodes): state, _ = env.reset() while True: action = agent.sample_action(state) next_state, reward, terminated, _ = env.step(action) agent.rewards.append(reward) state = next_state if terminated: break agent.update_policy()
REINFORCE with Baseline
We can recover the full REINFORCE algorithm by combining SGD with the score function estimate with a baseline as follows
Algorithm 10 REINFORCE (episodic) with value function baseline
Initialize policy parameters θ, baseline parameters w
repeat
Sample episode τ=(s1,a1,r1,s2,…,sT) using πθ
for k=1,…,T do
Gk=∑l=kTγl−krl
δk=Gk−Vw(sk)
w←w−ηwδk∇wVw(sk)
θ←θ+ηγk−1δk∇θlogπθ(ak∣sk)
end for
until converged
Implementation
class ActorCriticNetwork(nn.Module): def __init__(self, state_dim, action_dim, hidden_dim=128): super(ActorCriticNetwork, self).__init__() self.backbone = nn.Sequential( nn.Linear(state_dim, hidden_dim), nn.ReLU(), ) self.policy_head = nn.Sequential( nn.Linear(hidden_dim, action_dim), nn.Softmax(dim=-1) ) self.value_head = nn.Linear(hidden_dim, 1) def forward(self, state): state = torch.from_numpy(state).float().unsqueeze(0) features = self.backbone(state) action_probs = self.policy_head(features) state_value = self.value_head(features).squeeze(-1) return action_probs, state_valueclass REINFORCEAgent: def __init__(self, state_dim: int, num_actions: int, lr: float, gamma: float): self.model = ActorCriticNetwork(state_dim, num_actions) self.optimizer = optim.Adam(self.model.parameters(), lr=lr) self.gamma = gamma # Memory lists to record episode history self.log_probs = [] self.values = [] self.rewards = [] def sample_action(self, state) -> int: probs, value = self.model(state) m = Categorical(probs) action = m.sample() self.log_probs.append(m.log_prob(action)) self.values.append(value) return action.item() def update_policy(self): discounted_rewards = [] cumulative_return = 0 # Compute returns G_t backwards from end of episode for r in reversed(self.rewards): cumulative_return = r + self.gamma * cumulative_return discounted_rewards.insert(0, cumulative_return) returns = torch.tensor(discounted_rewards, dtype=torch.float32) value_estimates = torch.cat(self.values) # Detach the value estimates here so the policy loss does not update the critic. # The policy should treat the advantages as fixed weights. advantages = returns - value_estimates.detach() policy_loss = [] for log_prob, advantage in zip(self.log_probs, advantages): policy_loss.append(-log_prob * advantage) # The value loss still backpropagates through the shared backbone and value head. value_loss = F.mse_loss(value_estimates, returns) self.optimizer.zero_grad() loss = torch.stack(policy_loss).sum() + value_loss loss.backward() self.optimizer.step() self.log_probs = [] self.values = [] self.rewards = []def train(env, num_episodes: int): agent = REINFORCEAgent(env.state_dim, env.num_actions, lr=0.005) for episode in range(num_episodes): state, _ = env.reset() while True: action = agent.sample_action(state) next_state, reward, terminated, _ = env.step(action) agent.rewards.append(reward) state = next_state if terminated: break agent.update_policy()
Sources
Murphy, K. (2025). Reinforcement Learning: An Overview. Chapter 3.