PPO和扩散模型结合的思路

张开发
2026/6/16 13:17:02 15 分钟阅读
PPO和扩散模型结合的思路
# PPO from scratch for CartPole: heavily commented, with a reward plot and a
# pre-wired hook for Diffusion/PET-style synthetic-experience extensions.
#
# NOTE: `gym` and `matplotlib` are imported lazily inside `train()` so that the
# networks, the generator and `compute_gae` can be imported (and unit-tested)
# on machines that have neither a simulator nor a plotting backend installed.
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# ---------------------------------------------------------------------------
# Hyper-parameters
# ---------------------------------------------------------------------------
GAMMA = 0.99          # discount factor
LAMBDA = 0.95         # GAE parameter; controls the bias-variance trade-off
CLIP_EPS = 0.2        # PPO clipping parameter
LR_ACTOR = 3e-4       # learning rate of the actor network
LR_CRITIC = 1e-3      # learning rate of the critic network
UPDATE_EPOCHS = 5     # number of optimisation passes over each collected batch
BATCH_SIZE = 64       # mini-batch size
MAX_STEPS = 2048      # environment steps sampled per training iteration
TARGET_REWARD = 500   # reaching this return counts as "solved"

# ---------------------------------------------------------------------------
# Diffusion-related extension
# ---------------------------------------------------------------------------
USE_SYNTHETIC = True      # whether to mix synthetic experience into each batch
# Module-level synthetic-experience pool. `train()` keeps its per-iteration
# synthetic batch in a local variable and does not write to this list.
synth_experiences = []


class DummyDiffusionGenerator:
    """Stand-in for a diffusion model that emits synthetic transitions.

    Every field of a generated transition is random noise; the class only
    fixes the tuple layout that a real generator is expected to produce.
    """

    def __init__(self, obs_dim, act_dim):
        self.obs_dim = obs_dim
        self.act_dim = act_dim

    def generate(self, n):
        """Return a list of ``n`` random transitions.

        Each item is ``(obs, action, logp, reward, done, value)`` where
        ``obs`` is a standard-normal vector of length ``obs_dim``, ``action``
        is a uniform integer in ``[0, act_dim)``, ``logp`` is the log
        probability of a uniform policy, ``reward`` and ``value`` are
        standard-normal scalars and ``done`` is 1 with probability 0.1.
        """
        uniform_logp = np.log(1.0 / self.act_dim)
        data = []
        for _ in range(n):
            obs = np.random.randn(self.obs_dim)
            action = np.random.randint(self.act_dim)
            reward = np.random.randn()
            done = np.random.choice([0, 1], p=[0.9, 0.1])
            value = np.random.randn()
            data.append((obs, action, uniform_logp, reward, done, value))
        return data


class Actor(nn.Module):
    """Policy network: maps an observation to action probabilities."""

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, act_dim),
            nn.Softmax(dim=-1),  # emits probabilities, not logits
        )

    def forward(self, obs):
        return self.net(obs)


class Critic(nn.Module):
    """Value network: maps an observation to a scalar state value."""

    def __init__(self, obs_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, 1),
        )

    def forward(self, obs):
        # Drop the trailing singleton dimension so a batch yields shape (B,).
        return self.net(obs).squeeze(-1)


def compute_gae(rewards, dones, values, next_value):
    """Compute Generalised Advantage Estimates for one contiguous rollout.

    Args:
        rewards: per-step rewards.
        dones: per-step episode-end flags (bool or 0/1); a set flag stops
            both the bootstrap and the accumulated advantage from crossing
            the episode boundary.
        values: per-step critic estimates, same length as ``rewards``.
        next_value: critic estimate for the state following the last step.

    Returns:
        A list of advantages with the same length as ``rewards``. The inputs
        are not modified.
    """
    extended = list(values) + [next_value]
    advantages = []
    gae = 0.0
    for step in reversed(range(len(rewards))):
        not_done = 1.0 - float(dones[step])
        delta = (
            rewards[step]
            + GAMMA * extended[step + 1] * not_done
            - extended[step]
        )
        gae = delta + GAMMA * LAMBDA * not_done * gae
        # Append + reverse instead of insert(0, ...) keeps this linear-time.
        advantages.append(gae)
    advantages.reverse()
    return advantages


def _reset_env(env):
    """Reset ``env`` and return only the observation.

    Newer Gym/Gymnasium releases return ``(obs, info)`` from ``reset()``
    whereas older Gym returns the observation alone; both are accepted.
    """
    result = env.reset()
    if (
        isinstance(result, tuple)
        and len(result) == 2
        and isinstance(result[1], dict)
    ):
        return result[0]
    return result


def _step_env(env, action):
    """Step ``env`` and return ``(obs, reward, done)``.

    Accepts both the legacy 4-tuple and the newer 5-tuple
    ``(obs, reward, terminated, truncated, info)`` step result.
    """
    result = env.step(action)
    if len(result) == 5:
        obs, reward, terminated, truncated, _ = result
        return obs, reward, bool(terminated or truncated)
    obs, reward, done, _ = result
    return obs, reward, bool(done)


def _ppo_update(actor, critic, opt_actor, opt_critic,
                obs_tensor, act_tensor, logp_old_tensor, adv_tensor,
                ret_tensor):
    """Run ``UPDATE_EPOCHS`` passes of clipped-surrogate PPO over one batch."""
    n_samples = obs_tensor.shape[0]
    for _ in range(UPDATE_EPOCHS):
        # Shuffle so mini-batches are not runs of consecutive time steps
        # (or, at the tail of the buffer, purely synthetic samples).
        order = torch.randperm(n_samples)
        for start in range(0, n_samples, BATCH_SIZE):
            idx = order[start:start + BATCH_SIZE]

            dist = Categorical(probs=actor(obs_tensor[idx]))
            logp = dist.log_prob(act_tensor[idx])
            ratio = torch.exp(logp - logp_old_tensor[idx])
            surr1 = ratio * adv_tensor[idx]
            surr2 = (
                torch.clamp(ratio, 1 - CLIP_EPS, 1 + CLIP_EPS)
                * adv_tensor[idx]
            )
            actor_loss = -torch.min(surr1, surr2).mean()

            value_pred = critic(obs_tensor[idx])
            critic_loss = ((value_pred - ret_tensor[idx]) ** 2).mean()

            opt_actor.zero_grad()
            actor_loss.backward()
            opt_actor.step()

            opt_critic.zero_grad()
            critic_loss.backward()
            opt_critic.step()


def train(max_iterations=1000):
    """Train PPO on CartPole-v1 and plot the per-episode returns.

    Args:
        max_iterations: upper bound on collect/update iterations.

    Returns:
        The list of returns of every episode completed during training.
    """
    # Lazy imports: see the note at the top of the script.
    import gym
    import matplotlib.pyplot as plt

    env = gym.make("CartPole-v1")
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n

    actor = Actor(obs_dim, act_dim)
    critic = Critic(obs_dim)
    opt_actor = optim.Adam(actor.parameters(), lr=LR_ACTOR)
    opt_critic = optim.Adam(critic.parameters(), lr=LR_CRITIC)

    reward_history = []
    diffusion_gen = DummyDiffusionGenerator(obs_dim, act_dim)

    for iteration in range(max_iterations):
        # ---- Step 1: collect on-policy data --------------------------------
        obs_buf, act_buf, logp_buf = [], [], []
        rew_buf, done_buf, val_buf = [], [], []
        finished_returns = []  # returns of episodes completed this iteration

        obs = _reset_env(env)
        ep_rew = 0.0
        for _ in range(MAX_STEPS):
            obs_tensor = torch.as_tensor(obs, dtype=torch.float32)
            # No gradients are needed while sampling.
            with torch.no_grad():
                dist = Categorical(probs=actor(obs_tensor))
                action = dist.sample()
                logp = dist.log_prob(action)
                value = critic(obs_tensor).item()

            next_obs, reward, done = _step_env(env, action.item())

            obs_buf.append(obs)
            act_buf.append(action.item())
            logp_buf.append(logp.item())
            rew_buf.append(reward)
            done_buf.append(done)
            val_buf.append(value)

            obs = next_obs
            ep_rew += reward
            if done:
                reward_history.append(ep_rew)
                finished_returns.append(ep_rew)
                ep_rew = 0.0  # start counting the next episode from zero
                obs = _reset_env(env)

        # ---- Step 1.5: synthetic data from the diffusion generator ---------
        synthetic_batch = diffusion_gen.generate(n=256) if USE_SYNTHETIC else []

        # ---- Step 2: advantages and returns --------------------------------
        # GAE is computed on the real rollout only, bootstrapping from the
        # state the rollout stopped in.
        with torch.no_grad():
            next_value = critic(
                torch.as_tensor(obs, dtype=torch.float32)
            ).item()
        adv_buf = compute_gae(rew_buf, done_buf, val_buf, next_value)

        # Synthetic samples are independent and carry no successor state, so
        # each is treated as an isolated one-step transition
        # (advantage = reward - value) rather than chained through GAE.
        syn_rewards = [exp[3] for exp in synthetic_batch]
        syn_values = [exp[5] for exp in synthetic_batch]
        adv_buf += compute_gae(
            syn_rewards, [1] * len(synthetic_batch), syn_values, 0.0
        )
        for exp in synthetic_batch:
            obs_buf.append(exp[0])
            act_buf.append(exp[1])
            logp_buf.append(exp[2])
            val_buf.append(exp[5])

        ret_buf = [a + v for a, v in zip(adv_buf, val_buf)]

        obs_tensor = torch.as_tensor(np.asarray(obs_buf, dtype=np.float32))
        act_tensor = torch.tensor(act_buf, dtype=torch.long)
        logp_old_tensor = torch.tensor(logp_buf, dtype=torch.float32)
        adv_tensor = torch.tensor(adv_buf, dtype=torch.float32)
        ret_tensor = torch.tensor(ret_buf, dtype=torch.float32)
        adv_tensor = (adv_tensor - adv_tensor.mean()) / (adv_tensor.std() + 1e-8)

        # ---- Step 3: PPO update --------------------------------------------
        _ppo_update(
            actor, critic, opt_actor, opt_critic,
            obs_tensor, act_tensor, logp_old_tensor, adv_tensor, ret_tensor,
        )

        # Report the mean return of the episodes that finished in this
        # iteration; fall back to the running return if none finished.
        if finished_returns:
            iter_reward = float(np.mean(finished_returns))
        else:
            iter_reward = ep_rew
        print(f"Iteration {iteration}: Reward {iter_reward:.2f}")
        if iter_reward >= TARGET_REWARD:
            print("Solved!")
            break

    env.close()

    # Visualise the training progress (one point per completed episode).
    plt.plot(reward_history)
    plt.title("PPO Training Reward")
    plt.xlabel("Episode")
    plt.ylabel("Episode Reward")
    plt.grid()
    plt.show()

    return reward_history


if __name__ == "__main__":
    train()

# Why does PPO start with data collection?
# In every training round PPO has to collect a fresh batch of experience
# trajectories (states, actions, rewards, ...) with the current policy and
# then optimise the policy on that data. This is the typical on-policy
# update paradigm: the policy and the data must match.

更多文章