Introduction
DeepSeek-R1 stunned the AI world by reaching GPT-4-level reasoning through pure reinforcement learning. At the heart of this breakthrough is GRPO (Group Relative Policy Optimization), an innovative RL algorithm that drops the traditional value (critic) network and instead optimizes the policy using rewards computed relative to a group of sampled responses.
GRPO tackles the core pain points of PPO (Proximal Policy Optimization): complexity, instability, and high memory consumption. Through a clever group-sampling design, GRPO trains more efficiently and more stably, and ultimately enabled DeepSeek-R1's reasoning breakthrough.
Problems with PPO
The Traditional Actor-Critic Architecture
PPO belongs to the Actor-Critic family of reinforcement learning algorithms:
import torch
import torch.nn.functional as F

class PPOArchitecture:
    """
    Traditional RLHF-style PPO requires multiple networks
    """
    def __init__(self, state_dim, action_dim):
        # Actor: learns the policy (what to do)
        self.actor = ActorNetwork(state_dim, action_dim)
        # Critic: estimates future rewards (value function)
        self.critic = CriticNetwork(state_dim)
        # Reward model: scores generated responses
        self.reward_model = CriticNetwork(state_dim)
        # Reference model for the KL constraint
        self.ref_model = ActorNetwork(state_dim, action_dim)

    def ppo_loss(self, states, actions, old_log_probs, advantages, returns):
        """
        PPO Clip Objective:
        L(θ) = E[min(r(θ) * A, clip(r(θ), 1-ε, 1+ε) * A)]
        Where r(θ) = π_θ(a|s) / π_θ_old(a|s)
        """
        # Get current policy probabilities
        new_log_probs = self.actor.get_log_prob(states, actions)
        # Compute probability ratio
        ratio = torch.exp(new_log_probs - old_log_probs)
        # Clipped surrogate objective
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1 - 0.2, 1 + 0.2) * advantages
        # Take minimum (pessimistic bound)
        policy_loss = -torch.min(surr1, surr2).mean()
        # Value function loss (regress the critic toward the empirical returns)
        values = self.critic(states)
        value_loss = F.mse_loss(values, returns)
        return policy_loss + 0.5 * value_loss
The Main Challenges of PPO
ppo_problems = {
    'multiple_models': 'Requires 4 models: actor (policy), critic (value), reward model, reference model',
    'hyperparameters': 'Needs careful tuning: clip epsilon, GAE lambda, value loss weight',
    'instability': 'Gradients can explode; requires gradient clipping and careful value-function updates',
    'memory': '40GB+ of GPU memory for a 7B model',
    'complexity': 'GAE (Generalized Advantage Estimation) is complex to compute',

    # Example of the code complexity
    'code_comparison': '''
    PPO requires:
    - advantage = compute_gae(rewards, values, gamma=0.99, lambda=0.95)
    - ratio = (new_policy / old_policy).exp()
    - clipped_ratio = ratio.clamp(1-eps, 1+eps)
    - loss = -min(ratio * advantage, clipped_ratio * advantage)
    - loss += 0.5 * value_loss + 0.01 * entropy_loss
    '''
}
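To make the GAE point concrete, here is a minimal sketch of Generalized Advantage Estimation for a single trajectory, assuming per-step rewards and value estimates are already available (the numbers in the example are made up):

import torch

def compute_gae(rewards, values, gamma=0.99, lam=0.95):
    """Minimal GAE sketch: `rewards` and `values` are 1-D tensors over timesteps."""
    advantages = torch.zeros_like(rewards)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        # Value of the next state is 0 at the end of the trajectory
        next_value = values[t + 1] if t + 1 < len(rewards) else 0.0
        # TD error at step t
        delta = rewards[t] + gamma * next_value - values[t]
        # Exponentially weighted sum of TD errors
        gae = delta + gamma * lam * gae
        advantages[t] = gae
    return advantages

# Example: a 4-step trajectory with hypothetical rewards and value estimates
print(compute_gae(torch.tensor([0.0, 0.0, 0.0, 1.0]),
                  torch.tensor([0.1, 0.2, 0.4, 0.7])))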
Core Principles of GRPO
The Key Insight
GRPO's key insight: for a single prompt we can generate multiple responses and compare their relative quality, instead of learning an absolute value function.
def grpo_key_insight():
    """
    GRPO's key insight:

    For each prompt q, sample G responses {o_1, o_2, ..., o_G} from the old policy π_θ_old,
    then compute a reward r(o_i) for each response.

    Use the group statistics as the baseline:
    - mean: average reward within the group
    - std:  standard deviation of rewards within the group

    Advantage: A_i = (r(o_i) - mean) / std

    No value network is needed!
    """
    pass
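As a minimal numeric sketch (with made-up rewards for four responses to one prompt), the group-relative advantage works out as follows:

import torch

# Hypothetical rewards for 4 sampled responses to the same prompt
rewards = torch.tensor([1.0, 0.0, 1.0, 0.0])
advantages = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
print(advantages)  # tensor([ 0.8660, -0.8660,  0.8660, -0.8660])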
The GRPO Loss Function
import torch
import torch.nn.functional as F

def grpo_loss(
    policy_logits,   # policy model logits: [batch, group_size, seq_len, vocab]
    ref_logits,      # reference model logits: same shape
    labels,          # token ids of the sampled responses: [batch, group_size, seq_len]
    mask,            # 1 for response tokens, 0 for padding: [batch, group_size, seq_len]
    rewards,         # reward per response: [batch, group_size]
    beta: float = 0.1,
    epsilon: float = 0.2   # clipping parameter for the clipped GRPO variant (unused in this simplified loss)
):
    """
    GRPO loss (simplified, unclipped form)

    Args:
        policy_logits: outputs of the policy model
        ref_logits: outputs of the (frozen SFT) reference model
        labels: token ids of each sampled response
        mask: attention mask used to ignore padding
        rewards: reward for each response, [batch, group_size]
        beta: KL penalty coefficient
        epsilon: clipping parameter (unused here)
    Returns:
        loss: scalar GRPO loss
    """
    # Per-token log probabilities
    policy_logprobs = F.log_softmax(policy_logits, dim=-1)
    ref_logprobs = F.log_softmax(ref_logits, dim=-1)

    # Gather the log probability of each sampled token, then sum over the sequence,
    # masking out padding. (For clarity this ignores the one-token shift between
    # logits and labels that a production implementation would handle.)
    token_logprobs = policy_logprobs.gather(-1, labels.unsqueeze(-1)).squeeze(-1)
    ref_token_logprobs = ref_logprobs.gather(-1, labels.unsqueeze(-1)).squeeze(-1)
    log_probs = (token_logprobs * mask).sum(dim=-1)          # [batch, group_size]
    ref_log_probs = (ref_token_logprobs * mask).sum(dim=-1)  # [batch, group_size]

    # Group-relative rewards (advantages), computed per prompt over its G responses
    mean_reward = rewards.mean(dim=1, keepdim=True)       # [batch, 1]
    std_reward = rewards.std(dim=1, keepdim=True) + 1e-8  # [batch, 1]
    advantages = (rewards - mean_reward) / std_reward      # [batch, group_size]

    # Policy gradient term: advantage-weighted log-probability difference
    # log π(o_i | q) - log π_ref(o_i | q)
    policy_ref_diff = log_probs - ref_log_probs            # [batch, group_size]
    weighted_diff = policy_ref_diff * advantages            # [batch, group_size]

    # KL penalty term (estimated as log π - log π_ref on the sampled responses)
    kl_penalty = log_probs - ref_log_probs                  # [batch, group_size]

    # Final loss: maximize the advantage-weighted term, regularize with the KL penalty
    loss = -(weighted_diff - beta * kl_penalty).mean()
    return loss
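A quick shape check with random tensors (purely illustrative, not real model outputs) shows how the pieces fit together:

import torch

batch, group, seq, vocab = 2, 4, 8, 100
policy_logits = torch.randn(batch, group, seq, vocab, requires_grad=True)
ref_logits = torch.randn(batch, group, seq, vocab)
labels = torch.randint(0, vocab, (batch, group, seq))
mask = torch.ones(batch, group, seq)
rewards = torch.rand(batch, group)

loss = grpo_loss(policy_logits, ref_logits, labels, mask, rewards, beta=0.1)
loss.backward()  # gradients flow only through the policy logits
print(loss.item())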
A Complete GRPO Implementation
class GRPOTrainer:
    """
    A complete GRPO training implementation
    """
    def __init__(
        self,
        policy_model,    # policy model to be trained
        ref_model,       # reference model (frozen SFT model)
        reward_fn,       # reward function
        tokenizer,       # tokenizer shared by both models
        optimizer,       # optimizer for the policy model
        beta: float = 0.1,
        group_size: int = 4,
        max_length: int = 512
    ):
        self.policy_model = policy_model
        self.ref_model = ref_model
        self.reward_fn = reward_fn
        self.tokenizer = tokenizer
        self.optimizer = optimizer
        self.beta = beta
        self.group_size = group_size
        self.max_length = max_length

        # Freeze the reference model
        for param in ref_model.parameters():
            param.requires_grad = False

    def sample_responses(self, prompts):
        """
        Sample multiple responses for each prompt
        """
        all_responses = []
        for prompt in prompts:
            # Sample several times to obtain a group of responses
            responses = []
            prompt_ids = self.tokenizer(prompt, return_tensors='pt').input_ids
            for _ in range(self.group_size):
                output_ids = self.policy_model.generate(
                    prompt_ids,
                    max_new_tokens=self.max_length,
                    do_sample=True,
                    temperature=0.7,
                )
                # Keep only the newly generated tokens
                response = self.tokenizer.decode(
                    output_ids[0, prompt_ids.shape[1]:], skip_special_tokens=True
                )
                responses.append(response)
            all_responses.append(responses)
        return all_responses

    def compute_rewards(self, prompts, responses):
        """
        Compute the reward for every response
        """
        all_rewards = []
        for prompt, response_group in zip(prompts, responses):
            # Score each response within the group
            group_rewards = []
            for response in response_group:
                reward = self.reward_fn(prompt, response)
                group_rewards.append(reward)
            all_rewards.append(group_rewards)
        return torch.tensor(all_rewards, dtype=torch.float32)

    def forward_batch(self, prompts, responses):
        """
        Forward pass and loss computation
        """
        batch_size = len(prompts)

        # Flatten to [batch * group_size] responses
        flattened_responses = [r for group in responses for r in group]

        # Tokenize
        inputs = self.tokenizer(
            flattened_responses,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=self.max_length
        )

        # Policy model forward pass
        policy_outputs = self.policy_model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask']
        )

        # Reference model forward pass (no gradients)
        with torch.no_grad():
            ref_outputs = self.ref_model(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask']
            )

        # Reshape to [batch, group_size, seq_len, vocab]
        policy_logits = policy_outputs.logits.view(
            batch_size, self.group_size, -1, self.policy_model.config.vocab_size
        )
        ref_logits = ref_outputs.logits.view(
            batch_size, self.group_size, -1, self.ref_model.config.vocab_size
        )
        labels = inputs['input_ids'].view(batch_size, self.group_size, -1)
        mask = inputs['attention_mask'].view(batch_size, self.group_size, -1)

        # Compute rewards
        rewards = self.compute_rewards(prompts, responses)

        # Compute the GRPO loss
        loss = grpo_loss(
            policy_logits,
            ref_logits,
            labels,
            mask,
            rewards,
            beta=self.beta
        )
        return loss

    def train_step(self, prompts):
        """
        A single training step
        """
        # 1. Sample responses
        responses = self.sample_responses(prompts)

        # 2. Forward pass and loss computation
        loss = self.forward_batch(prompts, responses)

        # 3. Backward pass
        self.optimizer.zero_grad()
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(self.policy_model.parameters(), 1.0)
        self.optimizer.step()

        return loss.item()
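A hypothetical end-to-end wiring might look like the sketch below. The model name, learning rate, and rule-based reward are placeholder choices, and the trainer above glosses over details such as masking prompt tokens, so treat this as the shape of the workflow rather than a production recipe.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder base model
policy_model = AutoModelForCausalLM.from_pretrained(model_name)
ref_model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def rule_based_reward(prompt, response):
    # Placeholder reward: 1.0 if the response contains an <answer> tag
    return 1.0 if "<answer>" in response else 0.0

optimizer = torch.optim.AdamW(policy_model.parameters(), lr=1e-6)

trainer = GRPOTrainer(
    policy_model=policy_model,
    ref_model=ref_model,
    reward_fn=rule_based_reward,
    tokenizer=tokenizer,
    optimizer=optimizer,
    beta=0.1,
    group_size=4,
)
print(trainer.train_step(["Solve step by step: 12 * 7 = ?"]))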
Application in DeepSeek-R1
GRPO's Role in DeepSeek-R1
class DeepSeekR1Training:
    """
    DeepSeek-R1 uses GRPO to train reasoning ability
    """
    def __init__(self):
        self.base_model = None
        self.sft_model = None    # frozen reference (SFT) model
        self.tokenizer = None
        self.optimizer = None
        self.reward_functions = []

    def setup_rewards(self):
        """
        R1 combines several reward functions
        """
        # 1. Accuracy reward: check whether the final answer is correct
        self.reward_functions.append(AccuracyReward())
        # 2. Format reward: require a specific output format
        self.reward_functions.append(FormatReward())
        # 3. Reasoning reward: check the thinking process
        self.reward_functions.append(ReasoningReward())

    def compute_composite_reward(self, prompt, response):
        """
        Combine multiple rewards
        """
        total_reward = 0.0
        for reward_fn in self.reward_functions:
            reward = reward_fn(prompt, response)
            total_reward += reward
        return total_reward

    def train(self, prompts):
        """
        Train with GRPO
        """
        trainer = GRPOTrainer(
            policy_model=self.base_model,
            ref_model=self.sft_model,
            reward_fn=self.compute_composite_reward,
            tokenizer=self.tokenizer,
            optimizer=self.optimizer,
            group_size=16,   # DeepSeek uses a larger group
            beta=0.04        # a smaller beta
        )
        for step in range(10000):
            loss = trainer.train_step(prompts)
            if step % 100 == 0:
                print(f"Step {step}: Loss = {loss:.4f}")
Reward Function Design
class AccuracyReward:
    """
    Accuracy reward: check whether the final answer is correct
    """
    def __init__(self):
        self.weight = 1.0

    def __call__(self, prompt, response):
        # Extract the answer and check its correctness
        extracted_answer = self.extract_answer(response)
        ground_truth = self.get_ground_truth(prompt)
        if extracted_answer == ground_truth:
            return self.weight
        else:
            return 0.0

    def extract_answer(self, response):
        # Extract the answer from the model response,
        # e.g. with a regular expression or special markers
        pass

    def get_ground_truth(self, prompt):
        # Look up the correct answer for the problem
        pass


class FormatReward:
    """
    Format reward: require the output to contain a thinking process
    """
    def __init__(self):
        self.weight = 0.1

    def __call__(self, prompt, response):
        # Check whether the response contains <think> and <answer> tags
        has_think = '<think>' in response and '</think>' in response
        has_answer = '<answer>' in response and '</answer>' in response
        if has_think and has_answer:
            return self.weight
        elif has_think or has_answer:
            return self.weight * 0.5
        else:
            return 0.0


class ReasoningReward:
    """
    Reasoning reward: encourage long reasoning chains
    """
    def __init__(self):
        self.weight = 0.01

    def __call__(self, prompt, response):
        # Reward longer reasoning, but only inside a well-formed <think> block
        think_content = self.extract_think(response)
        reasoning_length = len(think_content)
        # Normalize: longer reasoning earns more reward, up to a cap
        normalized_reward = min(reasoning_length / 1000, 1.0)
        return self.weight * normalized_reward

    def extract_think(self, response):
        # Return the text between <think> and </think>, or '' if absent
        if '<think>' in response and '</think>' in response:
            return response.split('<think>', 1)[1].split('</think>', 1)[0]
        return ''
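A small illustrative check of the format and reasoning rewards on a made-up response:

sample_response = (
    "<think>First compute 12 * 7 = 84, then verify by adding 12 seven times.</think>"
    "<answer>84</answer>"
)
print(FormatReward()("12 * 7 = ?", sample_response))     # 0.1 (both tags present)
print(ReasoningReward()("12 * 7 = ?", sample_response))  # small value: short reasoning chain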
Performance Analysis
GRPO vs PPO
# Performance comparison
performance_comparison = {
    'memory_usage': {
        'PPO': '40GB+ for 7B model',
        'GRPO': '20GB for 7B model',  # 50% reduction
    },
    'training_speed': {
        'PPO': '3 days on 8x A100',
        'GRPO': '1.5 days on 8x A100',  # 2x faster
    },
    'sample_efficiency': {
        'PPO': 'Uses value estimation, can be biased',
        'GRPO': 'Empirical baseline, more accurate',
    },
    'stability': {
        'PPO': 'Requires clipping, value loss weighting',
        'GRPO': 'Simple objective, more stable',
    },
    'hyperparameters': {
        'PPO': '10+ hyperparameters',
        'GRPO': '2-3 key hyperparameters (beta, group_size)',
    }
}
Math Reasoning Results
# DeepSeek-R1's performance on math reasoning tasks
math_results = {
    'GSM8K': {
        'base_model': '15.6%',
        'PPO_trained': '52.3%',
        'GRPO_trained': '89.3%',  # substantially higher
    },
    'MATH': {
        'base_model': '10.2%',
        'PPO_trained': '28.5%',
        'GRPO_trained': '47.1%',
    }
}
Implementation Details
Choosing the Group Size
def optimal_group_size(task_complexity):
    """
    Choose the group size based on task complexity

    Args:
        task_complexity: difficulty score from 1-10
    Returns:
        optimal group size
    """
    if task_complexity <= 3:
        # Simple tasks: a small group is enough
        return 4
    elif task_complexity <= 6:
        # Medium complexity
        return 8
    else:
        # Hard reasoning tasks
        return 16  # DeepSeek-R1 uses 16

# Rules of thumb:
# - Larger groups give a more accurate baseline estimate
# - But a larger group means fewer gradient updates per epoch at fixed compute
# - In practice, 4-16 is the common range
Beta Scheduling
import math

def cosine_beta_schedule(total_steps, start_beta=0.1, end_beta=0.04):
    """
    Beta schedule: gradually lower the KL penalty to allow larger policy updates
    """
    def get_beta(step):
        if step < total_steps * 0.1:
            # Early phase: high beta, stay close to the reference model
            return start_beta
        elif step > total_steps * 0.8:
            # Late phase: low beta, allow more exploration
            return end_beta
        else:
            # Middle phase: cosine annealing from start_beta down to end_beta
            progress = (step - total_steps * 0.1) / (total_steps * 0.7)
            return end_beta + (start_beta - end_beta) * (1 + math.cos(progress * math.pi)) / 2
    return get_beta
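A quick sanity check of the schedule (assuming, say, 10,000 total training steps):

get_beta = cosine_beta_schedule(total_steps=10_000)
for step in [0, 500, 3_000, 5_500, 9_000, 10_000]:
    print(step, round(get_beta(step), 4))
# Stays at 0.1 for the first 10% of steps, anneals smoothly through the middle,
# and settles at 0.04 for the final 20%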
An Advanced Variant: GRPO with Self-Consistency
from collections import Counter

def grpo_with_self_consistency(
    policy_model,
    ref_model,
    prompts,
    group_size=8
):
    """
    GRPO combined with self-consistency:

    1. Generate multiple responses per prompt
    2. Use majority voting to find the most consistent answer
    3. Give higher rewards to responses that agree with the majority answer
    """
    # Sample a group of responses for every prompt
    all_responses = []
    for prompt in prompts:
        group = [policy_model.generate(prompt) for _ in range(group_size)]
        all_responses.append(group)

    # Extract the answer from every response
    all_answers = [[extract_answer(r) for r in group] for group in all_responses]

    # Majority vote: the most frequent answer in each group is the "final" answer
    final_answers = []
    for answer_group in all_answers:
        counts = Counter(answer_group)
        final_answers.append(counts.most_common(1)[0][0])

    # Reward: responses that match their group's majority answer score higher
    rewards = []
    for group_answers, majority_answer in zip(all_answers, final_answers):
        group_rewards = [1.0 if answer == majority_answer else -0.1
                         for answer in group_answers]
        rewards.append(group_rewards)
    rewards = torch.tensor(rewards, dtype=torch.float32)

    # These rewards then feed into the standard GRPO loss; the logits are computed
    # exactly as in GRPOTrainer.forward_batch above
    return rewards
Comparison with DPO
# GRPO vs DPO
comparison = {
    'training_signal': {
        'DPO': 'Pairwise preference: chosen vs rejected',
        'GRPO': 'Group-relative rewards: relative ranking of several responses',
    },
    'reference_model': {
        'DPO': 'Required (for the KL term)',
        'GRPO': 'Required (for the KL term)',
    },
    'sampling': {
        'DPO': '2 responses per prompt',
        'GRPO': 'G responses per prompt (G >= 4)',
    },
    'reward_type': {
        'DPO': 'Binary preference',
        'GRPO': 'Continuous reward',
    },
    'use_case': {
        'DPO': 'General preference alignment',
        'GRPO': 'Strengthening reasoning ability',
    }
}
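For reference, a minimal sketch of the DPO objective (assuming sequence-level log-probabilities for the chosen and rejected responses have already been computed) makes the contrast with the group-relative objective concrete:

import torch.nn.functional as F

def dpo_loss(policy_chosen_logp, policy_rejected_logp,
             ref_chosen_logp, ref_rejected_logp, beta=0.1):
    """Minimal DPO sketch: pairwise preference between two responses per prompt."""
    chosen_ratio = policy_chosen_logp - ref_chosen_logp        # log π/π_ref for chosen
    rejected_ratio = policy_rejected_logp - ref_rejected_logp  # log π/π_ref for rejected
    # Maximize the margin between chosen and rejected log-ratios
    return -F.logsigmoid(beta * (chosen_ratio - rejected_ratio)).mean()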
Practical Recommendations
When to Use GRPO
# Scenarios where GRPO shines
grpo_ideal_cases = {
    'reasoning_tasks': 'Math, code, logical reasoning',
    'self_verification': 'The model can verify its own outputs',
    'rule_based_rewards': 'Correctness can be judged by clear rules',
    'limited_memory': 'The memory overhead of PPO is unaffordable',
    'quick_iteration': 'Fast experimentation and iteration is needed',

    # Poor fits:
    'subjective_preferences': 'Subjective preferences without a clear standard',
    'complex_environments': 'Tasks that require interacting with complex environments',
}
Common Pitfalls
# Common GRPO issues and fixes
common_issues = {
    'issue1': {
        'problem': 'Reward variance is too large',
        'solution': 'Increase the group size, or normalize rewards'
    },
    'issue2': {
        'problem': 'The model starts producing repetitive responses',
        'solution': 'Add a repetition-penalty reward term'
    },
    'issue3': {
        'problem': 'KL divergence grows too large',
        'solution': 'Increase beta'
    },
    'issue4': {
        'problem': 'Training is unstable',
        'solution': 'Use gradient clipping and lower the learning rate'
    }
}
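As one way to make issue2 concrete, here is a hypothetical repetition-penalty reward term; the n-gram heuristic and weights are illustrative choices, not what DeepSeek used:

def repetition_penalty_reward(prompt, response, n=4, weight=0.5):
    """Hypothetical reward term: penalize responses dominated by repeated n-grams."""
    tokens = response.split()
    if len(tokens) < n:
        return 0.0
    ngrams = [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    unique_ratio = len(set(ngrams)) / len(ngrams)
    # +weight when every n-gram is unique, approaching -weight when heavily repetitive
    return weight * (2 * unique_ratio - 1)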
Conclusion
GRPO represents a major step forward in reinforcement-learning-based optimization:
- Half the memory: roughly 50% less memory usage
- Twice the speed: about 2x faster training
- More stable: fewer hyperparameters and more reliable convergence
- A reasoning breakthrough: it powered DeepSeek-R1's mathematical reasoning ability
By replacing the value network with group-relative rewards, GRPO greatly simplifies the reinforcement learning pipeline while matching, and even improving, training quality. The algorithm is becoming the new standard for training reasoning models.