Initial commit

commit 7ec43ca17d (2026-03-31 13:28:59 +02:00)
314 changed files with 189852 additions and 0 deletions


@@ -0,0 +1,12 @@
# Reinforcement Learning
## Actor/Critic Method
The video for this tutorial is available at:<br>
https://www.youtube.com/watch?v=1okjkEMP79c
Below is the learning curve on the CartPole environment (https://gym.openai.com/envs/CartPole-v0/)<br>
Blue: 'critic' method<br>
Orange: 'actor' method<br>
Green: 'actor/critic' method<br>
![image](graph.png)
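
For reference, all three scripts below rely on the discounted return, which the `calcul_discount_rate` function computes backwards over an episode:

$$G_t = \sum_{k=0}^{T-t} \gamma^{k} \, r_{t+k}$$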


@@ -0,0 +1,83 @@
import gym
import tensorflow as tf
from tensorflow.keras import models, layers
import numpy as np
import os

env = gym.make("CartPole-v0")
env._max_episode_steps = 200
nbr_actions = 2
gamma = 0.99
max_episode = 600

# Open a new log file with an unused index
prefix_log_file = "log_actor"
id_file = 0
while os.path.exists(prefix_log_file + str(id_file) + ".csv"):
    id_file += 1
fichier_log = open(prefix_log_file + str(id_file) + ".csv", "w")
print("Creating log file", prefix_log_file + str(id_file) + ".csv")

def model(nbr_inputs, nbr_hidden, nbr_actions):
    # Policy network: observation -> action probabilities
    entree = layers.Input(shape=(nbr_inputs,), dtype='float32')
    result = layers.Dense(nbr_hidden, activation='relu')(entree)
    result = layers.Dense(nbr_hidden, activation='relu')(result)
    sortie = layers.Dense(nbr_actions, activation='softmax')(result)
    my_model = models.Model(inputs=entree, outputs=sortie)
    return my_model

def calcul_discount_rate(rewards_history, gamma, normalize=False):
    # Discounted return G_t, computed backwards over the episode
    result = []
    discounted_sum = 0
    for r in rewards_history[::-1]:
        discounted_sum = r + gamma * discounted_sum
        result.insert(0, discounted_sum)
    # Normalization
    if normalize is True:
        result = np.array(result)
        result = (result - np.mean(result)) / (np.std(result) + 1E-7)
        result = list(result)
    return result

def train():
    m_reward = 0
    for episode in range(max_episode):
        tab_rewards = []
        tab_prob_actions = []
        observations = env.reset()
        with tf.GradientTape() as tape:
            while True:
                # Sample an action from the policy distribution
                action_probs = my_model(np.expand_dims(observations, axis=0))
                action = np.random.choice(nbr_actions, p=np.squeeze(action_probs))
                tab_prob_actions.append(action_probs[0, action])
                observations, reward, done, info = env.step(action)
                tab_rewards.append(reward)
                if done:
                    break
            # REINFORCE loss: -log pi(a|s) weighted by the discounted return
            discount_rate = calcul_discount_rate(tab_rewards, gamma, normalize=True)
            loss = -tf.math.log(tab_prob_actions) * discount_rate
        gradients = tape.gradient(loss, my_model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, my_model.trainable_variables))
        score = sum(tab_rewards)
        # Exponential moving average of the episode score
        m_reward = 0.05 * score + (1 - 0.05) * m_reward
        message = "Episode {:04d} score:{:6.1f} MPE: {:6.1f}"
        print(message.format(episode, score, m_reward))
        fichier_log.write("{:f}:{:f}\n".format(score, m_reward))
        if m_reward > env._max_episode_steps - 10:
            print("End of training at episode {:d}".format(episode))
            break

my_model = model(4, 32, nbr_actions)
optimizer = tf.keras.optimizers.Adam(learning_rate=1E-2)
train()
fichier_log.close()
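
As a quick sanity check of calcul_discount_rate, the standalone snippet below (not part of the commit) recomputes the raw discounted returns, without normalization, for three unit rewards and gamma = 0.99:

# Hypothetical sanity check, mirroring calcul_discount_rate without normalization
gamma = 0.99
rewards = [1.0, 1.0, 1.0]
returns, discounted_sum = [], 0.0
for r in rewards[::-1]:
    discounted_sum = r + gamma * discounted_sum
    returns.insert(0, discounted_sum)
print(returns)   # [2.9701, 1.99, 1.0]: each step adds gamma times the next return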


@@ -0,0 +1,103 @@
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os

gamma = 0.99
max_steps_per_episode = 10000
env = gym.make("CartPole-v0")
env._max_episode_steps = 200

# Open a new log file with an unused index
prefix_log_file = "log_actor_critic_dsum_"
id_file = 0
while os.path.exists(prefix_log_file + str(id_file) + ".csv"):
    id_file += 1
fichier_log = open(prefix_log_file + str(id_file) + ".csv", "w")
print("Creating log file", prefix_log_file + str(id_file) + ".csv")

nbr_actions = 2
nbr_inputs = 4

def calcul_discount_rate(rewards_history, gamma, normalize=False):
    # Discounted return G_t, computed backwards over the episode
    result = []
    discounted_sum = 0
    for r in rewards_history[::-1]:
        discounted_sum = r + gamma * discounted_sum
        result.insert(0, discounted_sum)
    # Normalization
    if normalize is True:
        result = np.array(result)
        result = (result - np.mean(result)) / (np.std(result) + 1E-7)
        result = list(result)
    return result

def my_model(nbr_inputs, nbr_hidden, nbr_actions):
    # Two heads on a shared hidden layer:
    # actor (action probabilities) and critic (state value)
    entree = layers.Input(shape=(nbr_inputs,), dtype='float32')
    common = layers.Dense(nbr_hidden, activation="relu")(entree)
    action = layers.Dense(nbr_actions, activation="softmax")(common)
    critic = layers.Dense(1)(common)
    model = keras.Model(inputs=entree, outputs=[action, critic])
    return model

model = my_model(nbr_inputs, 32, nbr_actions)
optimizer = keras.optimizers.Adam(learning_rate=1E-2)
huber_loss = keras.losses.Huber()
m_reward = 0
episode = 0

while True:
    action_probs_history = []
    critic_value_history = []
    rewards_history = []
    state = env.reset()
    episode_reward = 0
    with tf.GradientTape() as tape:
        # Data collection
        for timestep in range(1, max_steps_per_episode):
            action_probs, critic_value = model(np.expand_dims(state, axis=0))
            critic_value_history.append(critic_value[0, 0])
            # Sample an action from the policy distribution
            action = np.random.choice(nbr_actions, p=np.squeeze(action_probs))
            action_probs_history.append(action_probs[0, action])
            state, reward, done, infos = env.step(action)
            rewards_history.append(reward)
            episode_reward += reward
            if done:
                break
        discount_rate = calcul_discount_rate(rewards_history, gamma, normalize=True)
        history = zip(action_probs_history, critic_value_history, discount_rate)
        actor_losses = []
        critic_losses = []
        for action_prob, critic_value, discount_rate in history:
            # Actor: -log pi(a|s) weighted by the advantage (return - value)
            actor_losses.append(-tf.math.log(action_prob) * (discount_rate - critic_value))
            # Critic: Huber loss between predicted value and observed return
            critic_losses.append(huber_loss([critic_value], [discount_rate]))
        # Total loss: mean over the concatenated actor and critic terms
        loss_value = tf.reduce_mean(actor_losses + critic_losses)
    grads = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    episode += 1
    # Exponential moving average of the episode score
    m_reward = 0.05 * episode_reward + (1 - 0.05) * m_reward
    message = "Episode {:04d} score:{:6.1f} MPE: {:6.1f}"
    print(message.format(episode, episode_reward, m_reward))
    fichier_log.write("{:f}:{:f}\n".format(episode_reward, m_reward))
    if m_reward > env._max_episode_steps - 10:
        print("End of training at episode {:d}".format(episode))
        break

fichier_log.close()
model.save("my_model")
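
Because the script ends with model.save("my_model"), the trained policy can be reloaded later; the greedy replay loop below is a minimal sketch (an assumption, not code from the commit):

# Minimal replay sketch (assumption, not in the original commit)
import gym
import numpy as np
from tensorflow import keras

model = keras.models.load_model("my_model")   # SavedModel directory written above
env = gym.make("CartPole-v0")
state = env.reset()
done, total = False, 0
while not done:
    # The model returns [action probabilities, critic value]; act greedily on the actor head
    action_probs, _ = model(np.expand_dims(state, axis=0))
    state, reward, done, _ = env.step(int(np.argmax(action_probs[0])))
    total += reward
print("episode return:", total)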


@@ -0,0 +1,114 @@
import gym
import tensorflow as tf
from tensorflow.keras import models, layers
import numpy as np
import os

env = gym.make("CartPole-v0")
env._max_episode_steps = 200
nbr_action = 2

# Open a new log file with an unused index
prefix_log_file = "log_critic_"
id_file = 0
while os.path.exists(prefix_log_file + str(id_file) + ".csv"):
    id_file += 1
fichier_log = open(prefix_log_file + str(id_file) + ".csv", "w")
print("Creating log file", prefix_log_file + str(id_file) + ".csv")

gamma = 0.98
max_episode = 600

# Linear epsilon-greedy schedule
epsilon = 1.
epsilon_min = 0.10
start_epsilon = 10
end_epsilon = max_episode
epsilon_decay_value = epsilon / (end_epsilon - start_epsilon)

def build_model():
    # Q-network: observation -> one Q-value per action (no output activation)
    entree = layers.Input(shape=(4,), dtype='float32')
    result = layers.Dense(32, activation='relu')(entree)
    result = layers.Dense(32, activation='relu')(result)
    sortie = layers.Dense(nbr_action)(result)
    return models.Model(inputs=entree, outputs=sortie)

def my_loss(target_q, predicted_q):
    # Mean squared error between target and predicted Q-values
    return tf.reduce_mean(tf.math.square(target_q - predicted_q))

@tf.function
def train_step(reward, action, observation, next_observation, done):
    # Q-learning target: r + gamma * max_a' Q(s', a'), zeroed on terminal steps
    next_Q_values = model(next_observation)
    best_next_actions = tf.math.argmax(next_Q_values, axis=1)
    next_mask = tf.one_hot(best_next_actions, nbr_action)
    next_best_Q_values = tf.reduce_sum(next_Q_values * next_mask, axis=1)
    target_Q_values = reward + (1 - done) * gamma * next_best_Q_values
    target_Q_values = tf.reshape(target_Q_values, (-1, 1))
    mask = tf.one_hot(action, nbr_action)
    with tf.GradientTape() as tape:
        all_Q_values = model(observation)
        # Keep only the Q-value of the action actually taken
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = my_loss(target_Q_values, Q_values)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)

def train(debug=False):
    global epsilon
    m_reward = 0
    for episode in range(max_episode):
        tab_observations = []
        tab_rewards = []
        tab_actions = []
        tab_next_observations = []
        tab_done = []
        observations = env.reset()
        score = 0
        while True:
            tab_observations.append(observations)
            # Epsilon-greedy action selection
            if np.random.random() > epsilon:
                valeurs_q = model(np.expand_dims(observations, axis=0))
                action = int(tf.argmax(valeurs_q[0], axis=-1))
            else:
                action = np.random.randint(0, nbr_action)
            observations, reward, done, info = env.step(action)
            score += reward
            tab_actions.append(action)
            tab_next_observations.append(observations)
            tab_done.append(done)
            if done:
                # Penalize the terminal transition
                tab_rewards.append(-10.)
                break
            tab_rewards.append(reward)
        tab_rewards = np.array(tab_rewards, dtype=np.float32)
        tab_actions = np.array(tab_actions, dtype=np.int32)
        tab_observations = np.array(tab_observations, dtype=np.float32)
        tab_next_observations = np.array(tab_next_observations, dtype=np.float32)
        tab_done = np.array(tab_done, dtype=np.float32)
        # One gradient step on the whole episode
        train_step(tab_rewards, tab_actions, tab_observations, tab_next_observations, tab_done)
        train_loss.reset_states()
        epsilon -= epsilon_decay_value
        epsilon = max(epsilon, epsilon_min)
        # Exponential moving average of the episode score
        m_reward = 0.05 * score + (1 - 0.05) * m_reward
        message = "Episode {:04d} score:{:6.1f} MPE: {:6.1f} (epsilon={:5.3f})"
        print(message.format(episode, score, m_reward, epsilon))
        fichier_log.write("{:f}:{:f}\n".format(score, m_reward))
        if m_reward > env._max_episode_steps - 10:
            print("End of training at episode {:d}".format(episode))
            break

model = build_model()
optimizer = tf.keras.optimizers.Adam(learning_rate=1E-2)
train_loss = tf.keras.metrics.Mean()
train()
fichier_log.close()
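
To make the target construction in train_step concrete, here is a small standalone check with arbitrarily chosen values (not part of the commit):

# Hypothetical check of the Q-learning target used in train_step
import tensorflow as tf

gamma = 0.98
next_Q_values = tf.constant([[0.5, 1.5],    # max is 1.5 (action 1)
                             [2.0, 0.0]])   # max is 2.0 (action 0)
reward = tf.constant([1.0, -10.0])
done = tf.constant([0.0, 1.0])              # second transition is terminal

best = tf.math.argmax(next_Q_values, axis=1)
mask = tf.one_hot(best, 2)
next_best = tf.reduce_sum(next_Q_values * mask, axis=1)
target = reward + (1 - done) * gamma * next_best
print(target.numpy())   # [2.47, -10.]: 1 + 0.98*1.5, and -10 with the future zeroed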

Binary file not shown.
