REINFORCE

REINFORCE

The REINFORCE algorithm uses the estimate of the gradient of the policy value together with SGD to fit a policy.

Intuition

We compute the sum of discounted future rewards induced by a trajectory, and if this is positive, we increase so as to make this trajectory more likely, otherwise we decrease . Thus, we reinforce good behaviors and reduce the chances of generating bad ones.

Algorithm 9 REINFORCE (episodic version)

Initialize policy parameters θ\mathbf{\theta}

repeat

Sample episode τ=(s1,a1,r1,s2,,sT)\tau = (s_1,a_1,r_1,s_2,\dots,s_T) using πθ\pi_{\mathbf{\theta}}

for k=1,,Tk=1,\dots,T do

Gk=l=kTγlkrlG_k = \sum_{l=k}^T \gamma^{l-k} r_l

θθ+ηγk1Gkθlogπθ(aksk)\mathbf{\theta} \gets \mathbf{\theta} + \eta \gamma^{k-1} G_k \nabla_{\mathbf{\theta}} \log \pi_{\mathbf{\theta}}(a_k \mid s_k)

end for

until converged

Implementation

class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super(PolicyNetwork, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.Softmax(dim=-1) # Outputs a valid probability distribution
        )
 
    def forward(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        return self.network(state)
 
class REINFORCEAgent:
    def __init__(self, state_dim: int, num_actions: int, lr: float, gamma: float):
        self.policy = PolicyNetwork(state_dim, num_actions)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.gamma = gamma
        # Memory lists to record episode history
        self.log_probs = []
        self.rewards = []
        
    def sample_action(self, state) -> int:
        probs = self.policy(state)
        m = Categorical(probs)
        action = m.sample()
        self.log_probs.append(m.log_prob(action))
        return action.item()
        
    def update_policy(self):
        discounted_rewards = []
        cumulative_return = 0
        # Compute returns G_t backwards from end of episode
        for r in reversed(self.rewards):
            cumulative_return = r + self.gamma * cumulative_return
            discounted_rewards.insert(0, cumulative_return)
        # Standardize returns
        returns = torch.tensor(discounted_rewards, dtype=torch.float32)
        if len(returns) > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        # We multiply by -1 because REINFORCE is gradient ascent on J, but optimizers minimize losses.
        policy_loss = []
        for log_prob, G_t in zip(self.log_probs, returns):
            policy_loss.append(-log_prob * G_t)
 
        self.optimizer.zero_grad()
        loss = torch.stack(policy_loss).sum()
        loss.backward()
        self.optimizer.step()
 
        self.log_probs = []
        self.rewards = []
        
def train(env, num_episodes: int):
    agent = REINFORCEAgent(
        env.state_dim,
        env.num_actions,
        lr=0.005,
        gamma=0.99,
    )
    for episode in range(num_episodes):
        state, _ = env.reset()
        while True:
            action = agent.sample_action(state)
            next_state, reward, terminated, _ = env.step(action)
            agent.rewards.append(reward)
            state = next_state
            if terminated:
                break
        agent.update_policy()

REINFORCE with Baseline

We can recover the full REINFORCE algorithm by combining SGD with the score function estimate with a baseline as follows

Algorithm 10 REINFORCE (episodic) with value function baseline

Initialize policy parameters θ\mathbf{\theta}, baseline parameters w\mathbf{w}

repeat

Sample episode τ=(s1,a1,r1,s2,,sT)\tau = (s_1,a_1,r_1,s_2,\dots,s_T) using πθ\pi_{\mathbf{\theta}}

for k=1,,Tk=1,\dots,T do

Gk=l=kTγlkrlG_k = \sum_{l=k}^T \gamma^{l-k} r_l

δk=GkVw(sk)\delta_k = G_k - V_{\mathbf{w}}(s_k)

wwηwδkwVw(sk)\mathbf{w} \gets \mathbf{w} - \eta_{\mathbf{w}} \delta_k \nabla_{\mathbf{w}} V_{\mathbf{w}} (s_k)

θθ+ηγk1δkθlogπθ(aksk)\mathbf{\theta} \gets \mathbf{\theta} + \eta \gamma^{k-1} \delta_k \nabla_{\mathbf{\theta}} \log \pi_{\mathbf{\theta}}(a_k \mid s_k)

end for

until converged

Implementation

class ActorCriticNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super(ActorCriticNetwork, self).__init__()
        self.backbone = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
        )
 
        self.policy_head = nn.Sequential(
            nn.Linear(hidden_dim, action_dim),
            nn.Softmax(dim=-1)
        )
 
        self.value_head = nn.Linear(hidden_dim, 1)
 
    def forward(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        features = self.backbone(state)
        action_probs = self.policy_head(features)
        state_value = self.value_head(features).squeeze(-1)
        return action_probs, state_value
 
class REINFORCEAgent:
    def __init__(self, state_dim: int, num_actions: int, lr: float, gamma: float):
        self.model = ActorCriticNetwork(state_dim, num_actions)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.gamma = gamma
        # Memory lists to record episode history
        self.log_probs = []
        self.values = []
        self.rewards = []
        
    def sample_action(self, state) -> int:
        probs, value = self.model(state)
        m = Categorical(probs)
        action = m.sample()
        self.log_probs.append(m.log_prob(action))
        self.values.append(value)
        return action.item()
        
    def update_policy(self):
        discounted_rewards = []
        cumulative_return = 0
        # Compute returns G_t backwards from end of episode
        for r in reversed(self.rewards):
            cumulative_return = r + self.gamma * cumulative_return
            discounted_rewards.insert(0, cumulative_return)
        returns = torch.tensor(discounted_rewards, dtype=torch.float32)
 
        value_estimates = torch.cat(self.values)
 
        # Detach the value estimates here so the policy loss does not update the critic.
        # The policy should treat the advantages as fixed weights.
        advantages = returns - value_estimates.detach()
 
        policy_loss = []
        for log_prob, advantage in zip(self.log_probs, advantages):
            policy_loss.append(-log_prob * advantage)
 
        # The value loss still backpropagates through the shared backbone and value head.
        value_loss = F.mse_loss(value_estimates, returns)
 
        self.optimizer.zero_grad()
        loss = torch.stack(policy_loss).sum() + value_loss
        loss.backward()
        self.optimizer.step()
 
        self.log_probs = []
        self.values = []
        self.rewards = []
        
def train(env, num_episodes: int):
    agent = REINFORCEAgent(env.state_dim, env.num_actions, lr=0.005)
    for episode in range(num_episodes):
        state, _ = env.reset()
        while True:
            action = agent.sample_action(state)
            next_state, reward, terminated, _ = env.step(action)
            agent.rewards.append(reward)
            state = next_state
            if terminated:
                break
        agent.update_policy()

Sources

  • Murphy, K. (2025). Reinforcement Learning: An Overview. Chapter 3.