#!/usr/bin/env python
# coding: utf-8

'''A minimal deep learning example: discrete reinforcement learning, rendered in a desktop UI'''
'''for LunarLander-v3, v1.0'''
'''usage: python gym_discrete.py'''

__credits__ = ["iBoxDB", "Bruce Yang CL-N", "2025-2"]

# 3rd-party packages
#https://pytorch.org  the CPU-only build is sufficient
import torch 
#pip install "gymnasium[box2d]"
#pip install pygame
import gymnasium 
#from gymnasium.envs.box2d.lunar_lander import LunarLander, LunarLanderContinuous

# numpy comes with the torch installation
import numpy
from copy import deepcopy

th = torch
nn = th.nn
np = numpy
gym = gymnasium

ndarray = np.ndarray
Tensor = th.Tensor

#np.random.seed(0)
#th.manual_seed(0)

th.set_num_threads(4)
th.set_default_dtype(th.float32)

with th.no_grad():
    '''A minimal deep learning example: discrete reinforcement learning, rendered in a desktop UI'''
    pass

#print(gym.envs.__file__)   

#look up which LunarLander versions (v2, v3, v4, ...) are registered on this machine
#gym.pprint_registry() 
gameName = "LunarLander-v3"
_continuous = False
_wind_power = 0.5
enable_wind = True
env = gym.make(gameName, continuous=_continuous,  enable_wind=enable_wind,
               wind_power=_wind_power)

env.reset() 

in_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
print(env.action_space)
print(action_dim, in_dim)
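# For LunarLander the observation has 8 values: x, y, x-velocity, y-velocity,
# angle, angular velocity and two leg-contact flags; the 4 discrete actions are
# do nothing, fire left engine, fire main engine, fire right engine.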

device = torch.device("cpu")


modelActor = nn.Sequential(
    nn.Linear(in_dim,256),
    nn.ReLU(),
    nn.Linear(256,256),
    nn.ReLU(), 
    nn.Linear(256,action_dim),   
) 
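# modelActor is a Q-network: it maps a state vector to one Q-value per
# discrete action, and the greedy policy takes the argmax of that output.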
t = list(modelActor.parameters())
modelActorOptim = torch.optim.Adam(t,lr=0.01)

modelActor_target = deepcopy(modelActor)
modelActor_target.load_state_dict(modelActor.state_dict())

soft_update_tau = 0.05
def soft_update(target_net: nn.Module, current_net: nn.Module):
    tau = soft_update_tau
    for tar, cur in zip(target_net.parameters(), current_net.parameters()):
        tar.data.copy_(cur.data * tau + tar.data * (1.0 - tau))
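
# A minimal sanity check for soft_update (never called during training);
# the layer shape and the helper name are arbitrary, for illustration only.
def _soft_update_example():
    cur = nn.Linear(2, 2)
    tar = nn.Linear(2, 2)
    old = tar.weight.data.clone()
    soft_update(tar, cur)
    # the target moved tau of the way towards the current network
    assert th.allclose(tar.weight.data,
                       cur.weight.data * soft_update_tau + old * (1.0 - soft_update_tau))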


explore_rate = 0.25
def modelActor_getaction(state):
    if th.rand(1) > explore_rate:
        with torch.no_grad():
            action = modelActor(state).argmax(dim=-1, keepdim=True) 
            action = action[0]
    else:
        action = torch.tensor(env.action_space.sample())
    return action
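
# Optional variant (not used below): decay the exploration rate between
# training rounds instead of keeping it fixed; the 0.99 factor and the
# 0.05 floor are example values, not part of the original script.
def _decay_explore_rate(factor=0.99, floor=0.05):
    global explore_rate
    explore_rate = max(floor, explore_rate * factor)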

def modelActor_getaction_human(state):
    with torch.no_grad():
        action = modelActor(state).argmax(dim=-1, keepdim=True) 
        action = action[0]
    return action 

def runHuman(steps):
    t_env = gym.make( gameName , continuous=_continuous, enable_wind=enable_wind,
                      wind_power=_wind_power,
                      render_mode="human",) 
    ary_state, _ = t_env.reset() 
    for i in range(steps): 
        state = torch.tensor(ary_state)
        action = modelActor_getaction_human(state)    
        ary_action = action.numpy()

        ary_state, reward, terminal, truncate, _ = t_env.step(ary_action)
        t_env.render()
        #print(ary_action, reward, terminal, truncate)
        if terminal or truncate : 
          #print(reward)
          ary_state, _ = t_env.reset()         
    t_env.close()  

runHuman(2)



# train_agent
train_agent_time_total = 70
for train_agent_time in range(train_agent_time_total): 
    horizon_len = 10000

    print(f"{train_agent_time}/{train_agent_time_total} : explore_env({gameName}, {horizon_len}, Wind={enable_wind})")

    states = th.zeros((horizon_len,in_dim,))
    actions = th.zeros((horizon_len), dtype=th.int64) 
    next_states = th.zeros((horizon_len,in_dim,))
    rewards = th.zeros(horizon_len)
    terminals = th.zeros(horizon_len)
    # truncates are recorded but not used when building the training targets
    truncates = th.zeros(horizon_len)
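    # Each buffer holds one transition per row: states[i] is the state before
    # step i, actions[i] the int64 action index (int64 so gather() works below),
    # rewards[i]/terminals[i] record what that step returned.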


    ary_state, _ = env.reset()
    for i in range(horizon_len):
        state = torch.tensor(ary_state)
        action = modelActor_getaction(state)
        ary_action = action.detach().cpu().numpy()
        ary_state, reward, terminal, truncate, _ = env.step(ary_action)
        #print(ary_action, reward, terminal, truncate)
        if truncate :
            terminal = True
            reward = -150

        # reward shaping: bonus for being close to the landing pad at the origin
        target = 2 - np.sqrt(state[0] * state[0] + state[1] * state[1])
        target *= 4

        reward += target
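        # Worked example of the shaping bonus: with d = sqrt(x*x + y*y),
        # target = (2 - d) * 4, so d=0 gives +8, d=1 gives +4, d=2 gives 0,
        # and anything farther than 2 is penalized.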

        if terminal: 
            ary_state, _ = env.reset()

        states[i] = state
        actions[i] = action 
        rewards[i] = reward
        terminals[i] = terminal
        truncates[i] = truncate

    next_states[:-1] = states[1:]
    next_states[-1:] = torch.tensor(ary_state)
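    # next_states is states shifted by one step (next_states[i] = states[i+1]);
    # only the last slot needs the freshly observed ary_state. Rows where the
    # episode ended are masked out below, so the reset state stored there never
    # leaks into a Bellman target.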

    states = states.detach()
    actions = actions.detach()
    rewards = rewards.detach()
    terminals = terminals.detach()
    truncates = truncates.detach()
    next_states = next_states.detach()

    undones = th.logical_not(terminals).detach()
    truncates = None
    print(f"end explore_env()")

    gamma = 0.97
    masks = undones * gamma
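    # masks[i] is gamma for steps that did not end the episode and 0 for
    # terminal steps, so terminal targets reduce to the immediate reward.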
    criterion = torch.nn.MSELoss()        


    for update_q_time in range(20):
        state_action_values = modelActor(states).gather(1, actions.unsqueeze(1))
        #print(state_action_values.shape)
        state_action_values = state_action_values.squeeze(1)

        next_state_values = modelActor_target(next_states).max(1).values
        expected_state_action_values = (next_state_values * masks) + rewards
        expected_state_action_values = expected_state_action_values.detach()
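        # This is the standard DQN target y = r + gamma * max_a Q_target(s', a):
        # the max comes from the slowly-updated target network and is detached,
        # so the gradient only flows through Q(s, a) above.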

        loss = criterion(state_action_values, expected_state_action_values)
        modelActorOptim.zero_grad()
        loss.backward()
        modelActorOptim.step() 

        soft_update(modelActor_target,modelActor)        
        print( loss.item() )

    runHuman(200)

print("End.")

input("Show UI?")
runHuman( 25000 )
 
