#!/usr/bin/env python
# coding: utf-8
'''The simplest deep learning, Discrete reinforcement, Run in desktop UI'''
'''for LunarLander-v3 v1.0'''
'''python gym_discrete.py'''

__credits__ = ["iBoxDB", "Bruce Yang CL-N", "2025-2"]

# 3rd-party packages
# https://pytorch.org  (CPU version only)
import torch

# pip install "gymnasium[box2d]"
# pip install pygame
import gymnasium
# from gymnasium.envs.box2d.lunar_lander import LunarLander, LunarLanderContinuous

# included in torch
import numpy

from copy import deepcopy

th = torch
nn = th.nn
np = numpy
gym = gymnasium
ndarray = np.ndarray
Tensor = th.Tensor

# np.random.seed(0)
# th.manual_seed(0)
th.set_num_threads(4)
th.set_default_dtype(th.float32)

with th.no_grad():
    '''The simplest deep learning, Discrete reinforcement, Run in desktop UI'''
    pass

# print(gym.envs.__file__)  # look for LunarLander-v2, LunarLander-v3, LunarLander-v4 ... on this PC
# gym.pprint_registry()

gameName = "LunarLander-v3"
_continuous = False
_wind_power = 0.5
enable_wind = True

env = gym.make(gameName, continuous=_continuous,
               enable_wind=enable_wind, wind_power=_wind_power)
env.reset()

in_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
print(env.action_space)
print(action_dim, in_dim)

device = torch.device("cpu")

# Q-network: maps a state vector to one Q-value per discrete action.
modelActor = nn.Sequential(
    nn.Linear(in_dim, 256), nn.ReLU(),
    nn.Linear(256, 256), nn.ReLU(),
    nn.Linear(256, action_dim),
)
t = list(modelActor.parameters())
modelActorOptim = torch.optim.Adam(t, lr=0.01)

modelActor_target = deepcopy(modelActor)
modelActor_target.load_state_dict(modelActor.state_dict())

soft_update_tau = 0.05


def soft_update(target_net: nn.Module, current_net: nn.Module):
    # Polyak (soft) update: target <- tau * current + (1 - tau) * target
    tau = soft_update_tau
    for tar, cur in zip(target_net.parameters(), current_net.parameters()):
        tar.data.copy_(cur.data * tau + tar.data * (1.0 - tau))


explore_rate = 0.25


def modelActor_getaction(state):
    # epsilon-greedy: greedy action, or a random action with probability explore_rate
    if th.rand(1) > explore_rate:
        with torch.no_grad():
            action = modelActor(state).argmax(dim=-1, keepdim=True)
            action = action[0]
    else:
        action = torch.tensor(env.action_space.sample())
    return action


def modelActor_getaction_human(state):
    # always greedy; used when rendering for a human viewer
    with torch.no_grad():
        action = modelActor(state).argmax(dim=-1, keepdim=True)
        action = action[0]
    return action


def runHuman(steps):
    # roll out the current greedy policy in a desktop-UI (render_mode="human") environment
    t_env = gym.make(gameName, continuous=_continuous,
                     enable_wind=enable_wind, wind_power=_wind_power,
                     render_mode="human")
    ary_state, _ = t_env.reset()
    for i in range(steps):
        state = torch.tensor(ary_state)
        action = modelActor_getaction_human(state)
        ary_action = action.numpy()
        ary_state, reward, terminal, truncate, _ = t_env.step(ary_action)
        t_env.render()
        # print(ary_action, reward, terminal, truncate)
        if terminal or truncate:
            # print(reward)
            ary_state, _ = t_env.reset()
    t_env.close()


runHuman(2)

# train_agent
train_agent_time_total = 70
for train_agent_time in range(train_agent_time_total):
    horizon_len = 10000
    print(f"{train_agent_time}/{train_agent_time_total} : explore_env({gameName}, {horizon_len}, Wind={enable_wind})")

    states = th.zeros((horizon_len, in_dim,))
    actions = th.zeros((horizon_len), dtype=th.int64)
    next_states = th.zeros((horizon_len, in_dim,))
    rewards = th.zeros(horizon_len)
    terminals = th.zeros(horizon_len)
    # don't use truncate here
    truncates = th.zeros(horizon_len)

    ary_state, _ = env.reset()
    for i in range(horizon_len):
        state = torch.tensor(ary_state)
        action = modelActor_getaction(state)
        ary_action = action.detach().cpu().numpy()
        ary_state, reward, terminal, truncate, _ = env.step(ary_action)
        # print(ary_action, reward, terminal, truncate)
        if truncate:
            terminal = True
            reward = -150

        # go to this place: shaping bonus for staying close to the landing pad at the origin
        target = 2 - np.sqrt(state[0] * state[0] + state[1] * state[1])
        target *= 4
        reward += target

        if terminal:
            ary_state, _ = env.reset()

        states[i] = state
        actions[i] = action
        rewards[i] = reward
        terminals[i] = terminal
        truncates[i] = truncate

    next_states[:-1] = states[1:]
    next_states[-1:] = torch.tensor(ary_state)

    states = states.detach()
    actions = actions.detach()
    rewards = rewards.detach()
    terminals = terminals.detach()
    truncates = truncates.detach()
    next_states = next_states.detach()
    undones = th.logical_not(terminals).detach()
    truncates = None
    print("end explore_env()")

    gamma = 0.97
    masks = undones * gamma  # zero at terminal transitions, gamma elsewhere

    criterion = torch.nn.MSELoss()
    for update_q_time in range(20):
        # Q(s, a) for the actions actually taken
        state_action_values = modelActor(states).gather(1, actions.unsqueeze(1))
        # print(state_action_values.shape)
        state_action_values = state_action_values.squeeze(1)

        # one-step TD target: r + gamma * max_a' Q_target(s', a'), masked at terminals
        next_state_values = modelActor_target(next_states).max(1).values
        expected_state_action_values = (next_state_values * masks) + rewards
        expected_state_action_values = expected_state_action_values.detach()

        loss = criterion(state_action_values, expected_state_action_values)
        modelActorOptim.zero_grad()
        loss.backward()
        modelActorOptim.step()

        soft_update(modelActor_target, modelActor)
        print(loss.item())

    runHuman(200)

print("End.")
input("Show UI?")
runHuman(25000)
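
# Optional: persisting the trained Q-network so it can be reloaded without retraining.
# A minimal sketch using the standard torch.save / load_state_dict API; the file name
# "gym_discrete_actor.pt" is an assumption and not part of the original script.
#
# torch.save(modelActor.state_dict(), "gym_discrete_actor.pt")
# modelActor.load_state_dict(torch.load("gym_discrete_actor.pt"))
# modelActor_target.load_state_dict(modelActor.state_dict())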
# Last edit: iBoxDB 2025-03-11