Initial commit
Divers/renforcement2/CartPole_common.py (new file)
@@ -0,0 +1,12 @@
import numpy as np

# Low and high bounds of the observations
low_values=np.array([-5, -5, -0.45, -5])
high_values=np.array([5, 5, 0.45, 5])

# Number of buckets per dimension and the width of one bucket
division=[42, 42, 42, 42]
pas=(high_values-low_values)/division

def discretise(state):
    # Map a continuous observation to integer bucket indices
    discrete_state=(state-low_values)/pas
    # Clip so out-of-range observations cannot index outside the Q-table
    discrete_state=np.clip(discrete_state, 0, np.array(division)-1)
    # np.int was removed from NumPy; the built-in int behaves the same here
    return tuple(discrete_state.astype(int))
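As a quick illustration (not part of the original commit, and the sample state is hypothetical): a centered observation falls in the middle bucket of each dimension.

```python
import numpy as np
import CartPole_common

# Cart at the origin, pole upright and at rest: every component sits
# in the middle of its [low, high] range, i.e. bucket 21 of 42.
state = np.array([0.0, 0.0, 0.0, 0.0])
print(CartPole_common.discretise(state))  # (21, 21, 21, 21)
```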
Divers/renforcement2/CartPole_predict.py (new file)
@@ -0,0 +1,25 @@
import gym
import numpy as np
import CartPole_common

env=gym.make("CartPole-v0")
env._max_episode_steps=5000

q_table=np.load("CartPole_qtable.npy")

for epoch in range(1000):
    state = env.reset()
    score = 0
    while True:
        env.render()
        discrete_state=CartPole_common.discretise(state)
        # Greedy policy: always take the best known action
        action=np.argmax(q_table[discrete_state])
        #if not np.random.randint(5):
        #    action=np.random.randint(2)
        state, reward, done, info=env.step(action)
        score+=reward
        if done:
            print('Trial {:05d} Score: {:04d}'.format(epoch, int(score)))
            break

env.close()
Divers/renforcement2/CartPole_train.py (new file)
@@ -0,0 +1,82 @@
import gym
import numpy as np
import CartPole_common

env=gym.make("CartPole-v0")
env._max_episode_steps=500

# Learning rate
alpha=0.05
# Discount rate
gamma=0.98

epoch=50000
show_every=500

# Exploration/exploitation policy
epsilon=1.
epsilon_min=0.05
start_epsilon=1
end_epsilon=epoch//2
epsilon_decay_value=epsilon/(end_epsilon-start_epsilon)

nbr_action=env.action_space.n
q_table=np.random.uniform(low=-1, high=1, size=(CartPole_common.division+[nbr_action]))

result_done=0
scores=[]
best_score=0
for episode in range(epoch):
    obs=env.reset()
    discrete_state=CartPole_common.discretise(obs)
    done=False

    if episode%show_every == 0:
        render=True
        # scores is empty on the very first episode
        mean_score=np.mean(scores) if scores else 0.0
        print("Epoch {:06d}/{:06d} success:{:04d}/{:04d} epsilon={:06.4f} Mean score={:08.4f} alpha={:06.4f}".format(episode, epoch, result_done, show_every, epsilon, mean_score, alpha))
        scores=[]
        result_done=0
        if mean_score>best_score:
            print("Saving ...")
            np.save("CartPole_qtable", q_table)
            best_score=mean_score
            alpha=alpha*0.99
    else:
        render=False

    score=1
    while not done:
        # Epsilon-greedy action selection
        if np.random.random()>epsilon:
            action=np.argmax(q_table[discrete_state])
        else:
            action=np.random.randint(nbr_action)

        new_state, reward, done, info=env.step(action)
        new_discrete_state=CartPole_common.discretise(new_state)

        if render:
            env.render()

        #reward=2-np.abs(new_state[0])
        if done:
            scores.append(score)
            if score==env._max_episode_steps:
                result_done+=1
            else:
                # Penalise an early fall
                reward=-10

        # Update Q(s, a) with the Bellman equation
        max_future_q=np.max(q_table[new_discrete_state])
        current_q=q_table[discrete_state][action]
        new_q=(1-alpha)*current_q+alpha*(reward+gamma*max_future_q)
        q_table[discrete_state][action]=new_q

        score+=1
        discrete_state=new_discrete_state

    if end_epsilon>=episode>=start_epsilon:
        epsilon-=epsilon_decay_value
        if epsilon<epsilon_min:
            epsilon=epsilon_min

env.close()
Divers/renforcement2/MountainCar_common.py (new file)
@@ -0,0 +1,12 @@
import numpy as np

# Low and high bounds of the observations
low_values =np.array([-1.2, -0.07])
high_values=np.array([0.6, 0.07])

# Number of buckets per dimension and the width of one bucket
division=[42, 42]
pas=(high_values-low_values)/division

def discretise(state):
    # Map a continuous observation to integer bucket indices
    discrete_state=(state-low_values)/pas
    # Clip so out-of-range observations cannot index outside the Q-table
    discrete_state=np.clip(discrete_state, 0, np.array(division)-1)
    # np.int was removed from NumPy; the built-in int behaves the same here
    return tuple(discrete_state.astype(int))
Divers/renforcement2/MountainCar_predict.py (new file)
@@ -0,0 +1,19 @@
import gym
import numpy as np
import MountainCar_common

env=gym.make("MountainCar-v0")

q_table=np.load("MountainCar_qtable.npy")

for epoch in range(1000):
    state = env.reset()
    while True:
        env.render()
        discrete_state=MountainCar_common.discretise(state)
        # Greedy policy: always take the best known action
        action=np.argmax(q_table[discrete_state])
        state, reward, done, info=env.step(action)
        if done:
            print("Trial {:05d}: {}".format(epoch, "OK" if state[0]>=env.goal_position else "failed ..."))
            break
env.close()
Divers/renforcement2/MountainCar_train.py (new file)
@@ -0,0 +1,68 @@
import gym
import numpy as np
import MountainCar_common

env=gym.make("MountainCar-v0")

# Learning rate
alpha=0.1
# Discount rate
gamma=0.98

epoch=25000
show_every=500

# Exploration/exploitation policy
epsilon=1.
epsilon_min=0.1
start_epsilon=1
end_epsilon=epoch//2
epsilon_decay_value=epsilon/(end_epsilon-start_epsilon)

nbr_action=env.action_space.n
q_table=np.random.uniform(low=-1, high=1, size=(MountainCar_common.division+[nbr_action]))

OK=0
for episode in range(epoch):
    obs=env.reset()
    discrete_state=MountainCar_common.discretise(obs)
    done=False

    if episode%show_every == 0:
        render=True
        print("Epoch {:06d}/{:06d} success:{:04d}/{:04d} epsilon={:08.6f}".format(episode, epoch, OK, show_every, epsilon))
        OK=0
    else:
        render=False

    while not done:
        # Epsilon-greedy action selection
        if np.random.random()>epsilon:
            action=np.argmax(q_table[discrete_state])
        else:
            action=np.random.randint(nbr_action)

        new_state, reward, done, info=env.step(action)
        new_discrete_state=MountainCar_common.discretise(new_state)
        if render:
            env.render()

        # Reward reaching the flag (the default reward is -1 per step)
        if new_state[0]>=env.goal_position:
            reward=1
            OK+=1

        # Update Q(s, a) with the Bellman equation
        max_future_q=np.max(q_table[new_discrete_state])
        current_q=q_table[discrete_state][action]
        new_q=(1-alpha)*current_q+alpha*(reward+gamma*max_future_q)
        q_table[discrete_state][action]=new_q

        discrete_state=new_discrete_state

    if end_epsilon>=episode>=start_epsilon:
        epsilon-=epsilon_decay_value
        if epsilon<epsilon_min:
            epsilon=epsilon_min

np.save("MountainCar_qtable", q_table)
env.close()
Divers/renforcement2/README.md (new file)
@@ -0,0 +1,6 @@
# Reinforcement learning
## Bellman equation
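
Concretely, both training scripts implement the tabular Q-learning update, a sampled form of the Bellman optimality equation, where alpha is the learning rate and gamma the discount rate:

Q(s, a) ← (1 − α)·Q(s, a) + α·(r + γ·max<sub>a′</sub> Q(s′, a′))

This is the `new_q=(1-alpha)*current_q+alpha*(reward+gamma*max_future_q)` line in the training loops.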

The video for this tutorial is available at:<br>
https://www.youtube.com/watch?v=4Ak6OyehqJc