
GRPO: Group Relative Policy Optimization, DeepSeek's RL Breakthrough

Introduction

DeepSeek-R1 stunned the AI world by achieving GPT-4-level reasoning through pure reinforcement learning. At the core of this breakthrough is GRPO (Group Relative Policy Optimization), an innovative RL algorithm that drops the traditional critic (value) network and instead optimizes the policy using relative rewards computed within a group of sampled responses.

GRPO tackles the core problems of PPO (Proximal Policy Optimization): complexity, instability, and high memory consumption. Through its group-sampling design, GRPO trains more efficiently and more stably, and it ultimately enabled DeepSeek-R1's reasoning breakthrough.

Problems with PPO

The Traditional Actor-Critic Architecture

PPO belongs to the actor-critic family of reinforcement learning algorithms:

import torch
import torch.nn.functional as F

class PPOArchitecture:
    """
    Traditional RLHF-style PPO requires multiple networks
    """
    
    def __init__(self, state_dim, action_dim):
        # Actor: learns the policy (what to do)
        self.actor = ActorNetwork(state_dim, action_dim)
        
        # Critic: estimates future rewards (value function)
        self.critic = CriticNetwork(state_dim)
        
        # Reward model: scores completed responses
        self.reward_model = RewardNetwork(state_dim)
        
        # Reference model for the KL constraint
        self.ref_model = ActorNetwork(state_dim, action_dim)
        
    def ppo_loss(self, states, actions, old_log_probs, advantages, returns):
        """
        PPO clip objective:
        L(θ) = E[min(r(θ) * A, clip(r(θ), 1-ε, 1+ε) * A)]
        
        where r(θ) = π_θ(a|s) / π_θ_old(a|s)
        """
        
        # Get current policy log-probabilities
        new_log_probs = self.actor.get_log_prob(states, actions)
        
        # Compute probability ratio
        ratio = torch.exp(new_log_probs - old_log_probs)
        
        # Clipped surrogate objective
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1 - 0.2, 1 + 0.2) * advantages
        
        # Take minimum (pessimistic bound)
        policy_loss = -torch.min(surr1, surr2).mean()
        
        # Value function loss: the critic regresses toward the returns,
        # not the advantages
        values = self.critic(states)
        value_loss = F.mse_loss(values, returns)
        
        return policy_loss + 0.5 * value_loss

The Main Challenges of PPO

ppo_problems = {
    'multiple_models': 'Requires 4 models: actor, critic, reward model, reference',
    'hyperparameters': 'Needs careful tuning: clip epsilon, GAE lambda, value loss weight',
    'instability': 'Gradients can explode; requires gradient clipping and careful tuning',
    'memory': '40GB+ of GPU memory for a 7B model',
    'complexity': 'GAE (Generalized Advantage Estimation) is costly to compute (sketched below)',
    
    # Example of the code complexity involved
    'code_comparison': '''
    PPO requires:
    - advantage = compute_gae(rewards, values, gamma=0.99, lambda=0.95)
    - ratio = (new_policy / old_policy).exp()
    - clipped_ratio = ratio.clamp(1-eps, 1+eps)
    - loss = -min(ratio * advantage, clipped_ratio * advantage)
    - loss += 0.5 * value_loss + 0.01 * entropy_loss
    '''
}
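
To make the GAE point concrete, here is a minimal sketch of the advantage machinery PPO depends on and GRPO avoids (the function name and default gamma/lambda mirror the snippet above; this is an illustration, not DeepSeek's code):

import torch

def compute_gae(rewards, values, gamma=0.99, lam=0.95):
    # rewards: [T] tensor over one trajectory
    # values:  [T+1] tensor (one extra bootstrap value at the end)
    T = rewards.shape[0]
    advantages = torch.zeros(T)
    gae = 0.0
    for t in reversed(range(T)):
        # TD residual at step t
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        # Exponentially weighted sum of residuals
        gae = delta + gamma * lam * gae
        advantages[t] = gae
    return advantages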

GRPO Core Principles

The Key Insight

GRPO's key insight: for the same prompt, we can generate multiple responses and compare their relative quality, instead of learning an absolute value function.

def grpo_key_insight():
    """
    GRPO key insight:
    
    For each prompt q, sample G responses {o_1, o_2, ..., o_G}
    from the old policy π_θ_old.
    
    Compute a reward r(o_i) for each response.
    
    Use group statistics as the baseline:
    - mean: average reward within the group
    - std: standard deviation of rewards within the group
    
    Advantage: A_i = (r(o_i) - mean) / std
    
    No value network needed!
    """
    pass
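
A quick worked example of the group-relative advantage (the rewards are made up for illustration):

import torch

# Four sampled responses for one prompt, with toy rewards
rewards = torch.tensor([1.0, 0.0, 0.5, 0.5])

advantages = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
print(advantages)  # tensor([ 1.2247, -1.2247,  0.0000,  0.0000])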

The GRPO Loss Function

import torch
import torch.nn.functional as F

def grpo_loss(
    policy_logits,     # policy model logits: [batch, group_size, seq_len, vocab]
    ref_logits,        # reference model logits, same shape
    response_ids,      # sampled token ids: [batch, group_size, seq_len]
    response_mask,     # 1 for response tokens, 0 for padding
    rewards,           # rewards: [batch, group_size]
    beta: float = 0.1,
):
    """
    GRPO loss function
    
    Args:
        policy_logits: policy model outputs
        ref_logits: reference (SFT) model outputs
        response_ids: token ids of the sampled responses
        response_mask: attention mask that zeroes out padding tokens
        rewards: reward for each response, [batch, group_size]
        beta: KL penalty coefficient
        
    Returns:
        loss: scalar GRPO loss
    """
    # Per-token log-probabilities
    policy_logprobs = F.log_softmax(policy_logits, dim=-1)
    ref_logprobs = F.log_softmax(ref_logits, dim=-1)
    
    # Gather the log-probability of each actually sampled token
    # (for clarity this omits the usual one-position shift between
    # logits and labels)
    token_logprobs = policy_logprobs.gather(
        -1, response_ids.unsqueeze(-1)
    ).squeeze(-1)                                          # [batch, group_size, seq_len]
    ref_token_logprobs = ref_logprobs.gather(
        -1, response_ids.unsqueeze(-1)
    ).squeeze(-1)
    
    # Per-response log-probability: sum over non-padding tokens
    log_probs = (token_logprobs * response_mask).sum(dim=-1)       # [batch, group_size]
    ref_log_probs = (ref_token_logprobs * response_mask).sum(dim=-1)
    
    # Group-relative rewards (advantages), computed per prompt
    mean_reward = rewards.mean(dim=1, keepdim=True)        # [batch, 1]
    std_reward = rewards.std(dim=1, keepdim=True) + 1e-8   # [batch, 1]
    
    advantages = (rewards - mean_reward) / std_reward      # [batch, group_size]
    
    # Policy-gradient term: advantage-weighted sequence log-probability
    weighted_logprobs = log_probs * advantages             # [batch, group_size]
    
    # KL penalty (simple estimator) keeps the policy near the reference
    kl_penalty = log_probs - ref_log_probs                 # [batch, group_size]
    
    # Final loss: maximize weighted log-probs, regularized by the KL term
    loss = -(weighted_logprobs - beta * kl_penalty).mean()
    
    return loss
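
A minimal shape-level smoke test (all sizes here are illustrative, not DeepSeek's settings):

# 2 prompts, groups of 4 responses, 16 tokens each, vocab of 100
batch, G, T, V = 2, 4, 16, 100

policy_logits = torch.randn(batch, G, T, V)
ref_logits = torch.randn(batch, G, T, V)
response_ids = torch.randint(0, V, (batch, G, T))
response_mask = torch.ones(batch, G, T)
rewards = torch.randn(batch, G)

loss = grpo_loss(policy_logits, ref_logits, response_ids, response_mask, rewards)
print(loss)  # a scalar tensor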

A Complete GRPO Implementation

class GRPOTrainer:
    """
    A complete GRPO training implementation
    """
    
    def __init__(
        self,
        policy_model,     # the policy model being trained
        ref_model,        # reference model (frozen SFT model)
        reward_fn,        # reward function
        tokenizer,        # tokenizer shared by both models
        optimizer,        # optimizer over the policy parameters
        beta: float = 0.1,
        group_size: int = 4,
        max_length: int = 512
    ):
        self.policy_model = policy_model
        self.ref_model = ref_model
        self.reward_fn = reward_fn
        self.tokenizer = tokenizer
        self.optimizer = optimizer
        self.beta = beta
        self.group_size = group_size
        self.max_length = max_length
        
        # Freeze the reference model
        for param in ref_model.parameters():
            param.requires_grad = False
    
    def sample_responses(self, prompts):
        """
        Sample multiple responses for each prompt
        """
        all_responses = []
        
        for prompt in prompts:
            # Sample several times to build the group
            responses = []
            for _ in range(self.group_size):
                response = self.policy_model.generate(
                    prompt,
                    max_new_tokens=self.max_length,
                    do_sample=True,
                    temperature=0.7,
                )
                responses.append(response)
            
            all_responses.append(responses)
        
        return all_responses
    
    def compute_rewards(self, prompts, responses):
        """
        Compute the reward for each response
        """
        all_rewards = []
        
        for prompt, response_group in zip(prompts, responses):
            # Score every response in the group
            group_rewards = []
            for response in response_group:
                reward = self.reward_fn(prompt, response)
                group_rewards.append(reward)
            
            all_rewards.append(group_rewards)
        
        return torch.tensor(all_rewards, dtype=torch.float32)
    
    def forward_batch(self, prompts, responses):
        """
        Forward pass and loss computation
        """
        batch_size = len(prompts)
        
        # Flatten to [batch * group_size] response strings
        flattened_responses = [r for group in responses for r in group]
        
        # Tokenize
        inputs = self.tokenizer(
            flattened_responses,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=self.max_length
        )
        
        # Policy model forward pass
        policy_outputs = self.policy_model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask']
        )
        
        # Reference model forward pass (no gradients)
        with torch.no_grad():
            ref_outputs = self.ref_model(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask']
            )
        
        # Reshape to [batch, group_size, seq_len, vocab]
        seq_len = inputs['input_ids'].shape[1]
        policy_logits = policy_outputs.logits.view(
            batch_size, self.group_size, seq_len, -1
        )
        ref_logits = ref_outputs.logits.view(
            batch_size, self.group_size, seq_len, -1
        )
        response_ids = inputs['input_ids'].view(
            batch_size, self.group_size, seq_len
        )
        response_mask = inputs['attention_mask'].view(
            batch_size, self.group_size, seq_len
        ).float()
        
        # Compute rewards
        rewards = self.compute_rewards(prompts, responses)
        
        # Compute the GRPO loss
        loss = grpo_loss(
            policy_logits,
            ref_logits,
            response_ids,
            response_mask,
            rewards,
            beta=self.beta
        )
        
        return loss
    
    def train_step(self, prompts):
        """
        A single training step
        """
        # 1. Sample responses
        responses = self.sample_responses(prompts)
        
        # 2. Forward pass and loss computation
        loss = self.forward_batch(prompts, responses)
        
        # 3. Backward pass
        self.optimizer.zero_grad()
        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(self.policy_model.parameters(), 1.0)
        
        self.optimizer.step()
        
        return loss.item()
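
A hypothetical wiring sketch for the trainer; the reward function is a toy stand-in, and `policy_model`, `ref_model`, and `tokenizer` are assumed to exist with the string-in, string-out `generate()` interface the sketch above uses:

import torch

def toy_reward(prompt, response):
    # Placeholder reward for illustration: prefer shorter responses
    return -len(response) / 1000.0

trainer = GRPOTrainer(
    policy_model=policy_model,
    ref_model=ref_model,
    reward_fn=toy_reward,
    tokenizer=tokenizer,
    optimizer=torch.optim.AdamW(policy_model.parameters(), lr=1e-6),
    group_size=4,
)

loss = trainer.train_step(["Prove that the square root of 2 is irrational."])
print(f"loss = {loss:.4f}")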

Application in DeepSeek-R1

GRPO's Role in DeepSeek-R1

class DeepSeekR1Training:
    """
    DeepSeek-R1 uses GRPO to train reasoning ability
    """
    
    def __init__(self):
        self.base_model = None
        self.sft_model = None
        self.tokenizer = None
        self.reward_functions = []
    
    def setup_rewards(self):
        """
        R1 combines several reward functions
        """
        # 1. Accuracy reward: is the final answer correct?
        self.reward_functions.append(AccuracyReward())
        
        # 2. Format reward: does the model follow the required format?
        self.reward_functions.append(FormatReward())
        
        # 3. Reasoning reward: does the response show its thinking?
        self.reward_functions.append(ReasoningReward())
    
    def compute_composite_reward(self, prompt, response):
        """
        Combine the individual rewards
        """
        total_reward = 0.0
        
        for reward_fn in self.reward_functions:
            reward = reward_fn(prompt, response)
            total_reward += reward
        
        return total_reward
    
    def train(self, prompts):
        """
        Train with GRPO
        """
        trainer = GRPOTrainer(
            policy_model=self.base_model,
            ref_model=self.sft_model,
            reward_fn=self.compute_composite_reward,
            tokenizer=self.tokenizer,
            # illustrative optimizer and learning rate
            optimizer=torch.optim.AdamW(self.base_model.parameters(), lr=1e-6),
            group_size=16,  # DeepSeek uses a larger group
            beta=0.04       # and a smaller beta
        )
        
        for step in range(10000):
            loss = trainer.train_step(prompts)
            
            if step % 100 == 0:
                print(f"Step {step}: Loss = {loss:.4f}")
Reward Function Design

import re

class AccuracyReward:
    """
    Accuracy reward: checks whether the final answer is correct
    """
    
    def __init__(self):
        self.weight = 1.0
    
    def __call__(self, prompt, response):
        # Extract the answer and check it
        extracted_answer = self.extract_answer(response)
        ground_truth = self.get_ground_truth(prompt)
        
        if extracted_answer == ground_truth:
            return self.weight
        else:
            return 0.0
    
    def extract_answer(self, response):
        # Pull the answer out of the response, e.g. from an
        # <answer>...</answer> tag (regex or special markers work)
        match = re.search(r'<answer>(.*?)</answer>', response, re.DOTALL)
        return match.group(1).strip() if match else None
    
    def get_ground_truth(self, prompt):
        # Look up the correct answer for this problem
        # (e.g. from the dataset; left as a stub here)
        pass


class FormatReward:
    """
    Format reward: requires the output to include its thinking process
    """
    
    def __init__(self):
        self.weight = 0.1
    
    def __call__(self, prompt, response):
        # Check for <think> and <answer> tags in the response
        has_think = '<think>' in response and '</think>' in response
        has_answer = '<answer>' in response and '</answer>' in response
        
        if has_think and has_answer:
            return self.weight
        elif has_think or has_answer:
            return self.weight * 0.5
        else:
            return 0.0


class ReasoningReward:
    """
    Reasoning reward: encourages long reasoning chains
    """
    
    def __init__(self):
        self.weight = 0.01
    
    def __call__(self, prompt, response):
        # Reward longer reasoning, but only the part inside
        # a well-formed <think> block
        think_content = self.extract_think(response)
        reasoning_length = len(think_content)
        
        # Normalize: longer reasoning earns more reward, up to a cap
        normalized_reward = min(reasoning_length / 1000, 1.0)
        
        return self.weight * normalized_reward
    
    def extract_think(self, response):
        # Return the content of the <think>...</think> block, if any
        match = re.search(r'<think>(.*?)</think>', response, re.DOTALL)
        return match.group(1) if match else ''
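
Putting two of these rewards together on a toy response (the string is made up; the numbers follow from the weights above):

response = "<think>" + "step " * 50 + "</think><answer>42</answer>"

print(FormatReward()("q", response))     # 0.1    (both tags present)
print(ReasoningReward()("q", response))  # 0.0025 (250 chars / 1000 * 0.01)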

Performance Analysis

GRPO vs PPO

# Performance comparison
performance_comparison = {
    'memory_usage': {
        'PPO': '40GB+ for 7B model',
        'GRPO': '20GB for 7B model',  # ~50% reduction
    },
    'training_speed': {
        'PPO': '3 days on 8x A100',
        'GRPO': '1.5 days on 8x A100',  # ~2x faster
    },
    'sample_efficiency': {
        'PPO': 'Uses value estimation, which can be biased',
        'GRPO': 'Empirical group baseline, more accurate',
    },
    'stability': {
        'PPO': 'Requires clipping and value-loss weighting',
        'GRPO': 'Simpler objective, more stable',
    },
    'hyperparameters': {
        'PPO': '10+ hyperparameters',
        'GRPO': '2-3 key hyperparameters (beta, group_size)',
    }
}

Math Reasoning Results

# DeepSeek-R1 performance on math reasoning benchmarks
math_results = {
    'GSM8K': {
        'base_model': '15.6%',
        'PPO_trained': '52.3%',
        'GRPO_trained': '89.3%',  # substantially higher
    },
    'MATH': {
        'base_model': '10.2%',
        'PPO_trained': '28.5%',
        'GRPO_trained': '47.1%',
    }
}

Implementation Details

Choosing the Group Size

def optimal_group_size(task_complexity):
    """
    Pick a group size based on task complexity
    
    Rules of thumb:
    - Larger groups give a more accurate baseline estimate
    - But they reduce the number of gradient updates per epoch
    - In practice, 4-16 is the common range
    
    Args:
        task_complexity: task complexity score, 1-10
        
    Returns:
        optimal group size
    """
    if task_complexity <= 3:
        # Simple tasks: a small group is enough
        return 4
    elif task_complexity <= 6:
        # Medium complexity
        return 8
    else:
        # Hard reasoning tasks
        return 16  # DeepSeek-R1 uses 16
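
A quick simulation of the first rule of thumb above: the larger the group, the lower the variance of the empirical baseline (toy uniform rewards; all sizes are illustrative):

import torch

# Simulate 10,000 groups of G rewards drawn uniformly from [0, 1),
# whose true mean is 0.5, and measure the baseline estimation error
for G in (2, 4, 8, 16):
    samples = torch.rand(10000, G)
    error = (samples.mean(dim=1) - 0.5).abs().mean()
    print(f"G={G:2d}: mean baseline error = {error:.4f}")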

Beta Scheduling

import math

def cosine_beta_schedule(total_steps, start_beta=0.1, end_beta=0.04):
    """
    Beta schedule: gradually lower the KL penalty to allow
    larger policy updates later in training
    """
    def get_beta(step):
        if step < total_steps * 0.1:
            # Warm-up: high beta, stay close to the reference model
            return start_beta
        elif step > total_steps * 0.8:
            # Final phase: low beta, allow more exploration
            return end_beta
        else:
            # Middle phase: cosine annealing from start_beta to end_beta
            progress = (step - total_steps * 0.1) / (total_steps * 0.7)
            return end_beta + (start_beta - end_beta) * (1 + math.cos(math.pi * progress)) / 2
    
    return get_beta
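
Sampling the schedule at a few steps shows the intended shape (values computed from the defaults above):

get_beta = cosine_beta_schedule(total_steps=10000)
for step in (0, 1000, 5000, 8000, 9999):
    print(step, round(get_beta(step), 4))
# 0 -> 0.1, 1000 -> 0.1, 5000 -> ~0.0633, 8000 -> 0.04, 9999 -> 0.04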

Advanced Variant: GRPO with Self-Consistency

from collections import Counter

def grpo_with_self_consistency(
    policy_model,
    ref_model,
    prompts,
    group_size=8
):
    """
    GRPO combined with self-consistency:
    1. Generate multiple responses per prompt
    2. Use majority voting to pick the most consistent answer
    3. Give higher reward to responses matching the majority answer
    """
    
    # Sample a group of responses for each prompt:
    # all_responses[i] holds group_size responses for prompts[i]
    all_responses = [
        [policy_model.generate(prompt) for _ in range(group_size)]
        for prompt in prompts
    ]
    
    # Extract the answer from every response
    all_answers = [[extract_answer(r) for r in group] for group in all_responses]
    
    # Majority vote within each group
    majority_answers = [
        Counter(group).most_common(1)[0][0] for group in all_answers
    ]
    
    # Reward: responses matching their own group's majority answer
    # score higher
    rewards = []
    for group_answers, majority in zip(all_answers, majority_answers):
        rewards.append([1.0 if a == majority else -0.1 for a in group_answers])
    rewards = torch.tensor(rewards)
    
    # Standard GRPO loss; policy_logits / ref_logits / response tensors
    # come from a forward pass over the sampled responses
    # (omitted here, as in forward_batch above)
    return grpo_loss(policy_logits, ref_logits, response_ids, response_mask, rewards)

Comparison with DPO

# GRPO vs DPO
comparison = {
    'training_signal': {
        'DPO': 'Pairwise preference: chosen vs rejected',
        'GRPO': 'Group-relative rewards: relative ranking of several responses',
    },
    'reference_model': {
        'DPO': 'Required (for the KL term)',
        'GRPO': 'Required (for the KL term)',
    },
    'sampling': {
        'DPO': '2 responses per prompt',
        'GRPO': 'G responses per prompt (G >= 4)',
    },
    'reward_type': {
        'DPO': 'Binary preference',
        'GRPO': 'Continuous reward',
    },
    'use_case': {
        'DPO': 'General preference alignment',
        'GRPO': 'Reasoning enhancement',
    }
}
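
For contrast, a minimal sketch of the standard DPO loss (per-sequence log-probs, as in grpo_loss above; this is the textbook form, not DeepSeek code):

import torch.nn.functional as F

def dpo_loss(policy_chosen_lp, policy_rejected_lp,
             ref_chosen_lp, ref_rejected_lp, beta=0.1):
    # Implicit reward of each response relative to the reference model
    chosen_margin = policy_chosen_lp - ref_chosen_lp
    rejected_margin = policy_rejected_lp - ref_rejected_lp
    # Maximize the gap between the chosen and rejected margins
    return -F.logsigmoid(beta * (chosen_margin - rejected_margin)).mean()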

Practical Advice

When to Use GRPO

# Scenarios where GRPO works best
grpo_ideal_cases = {
    'reasoning_tasks': 'Math, code, logical reasoning',
    'self_verification': 'The model can verify its own outputs',
    'rule_based_rewards': 'A clear correctness criterion exists',
    'limited_memory': 'PPO memory overhead is unaffordable',
    'quick_iteration': 'Fast experimentation and iteration are needed',
    
    # Poor fits:
    'subjective_preferences': 'Subjective preferences with no clear criterion',
    'complex_environments': 'Interaction with complex environments is required',
}

Common Pitfalls

# Common GRPO issues and their fixes
common_issues = {
    'issue1': {
        'problem': 'Reward variance is too high',
        'solution': 'Increase the group size, or normalize rewards'
    },
    'issue2': {
        'problem': 'The model starts repeating itself',
        'solution': 'Add a repetition-penalty reward term (see the sketch below)'
    },
    'issue3': {
        'problem': 'KL divergence grows too large',
        'solution': 'Increase beta'
    },
    'issue4': {
        'problem': 'Training is unstable',
        'solution': 'Use gradient clipping and lower the learning rate'
    }
}
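
A minimal sketch of the repetition-penalty term mentioned for issue2 (a distinct-n-gram heuristic; the function name and weight are assumptions, not from the R1 paper):

def repetition_penalty(response, n=3, weight=0.5):
    # Penalize responses whose n-grams repeat heavily
    tokens = response.split()
    if len(tokens) < n:
        return 0.0
    ngrams = [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    unique_ratio = len(set(ngrams)) / len(ngrams)
    # 0 when all n-grams are unique, increasingly negative as repetition grows
    return -weight * (1.0 - unique_ratio)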

Conclusion

GRPO represents a major advance in reinforcement learning optimization:

  • Half the memory: roughly 50% lower memory usage
  • Twice the speed: about 2x faster training
  • More stable: fewer hyperparameters and more reliable convergence
  • Reasoning breakthrough: it powered DeepSeek-R1's mathematical reasoning

By replacing the value network with group-relative rewards, GRPO greatly simplifies the reinforcement learning pipeline while maintaining, and even improving on, training quality. The algorithm is becoming the new standard for training reasoning models.
