Interact with and run this Jupyter notebook online:
$\implies$ make sure you have installed all the required Python packages (see the README)!
Finally, you can also find this lecture rendered as HTML slides on GitHub $\nearrow$, along with the source repository $\nearrow$.
Imports and modules:
from config import (np, plt)
from scipy.constants import m_p, e, c
%matplotlib inline
image by GeekStyle
DeepMind, 2015 & 2017: AlphaGo & AlphaZero
OpenAI, 2019: hide-and-seek
OpenAI, 2022: ChatGPT
DeepMind & EPFL, 2022: tokamak control
DeepMind, 2022: AlphaTensor
There are various RL algorithms suitable for different types of tasks
Often the choice of algorithm depends on whether we deal with discrete or continuous state-action spaces
We will go through:
from qlearning.core import Maze
env = Maze(height=3, width=5)
env.plot(title='Initial state');
# Take some actions
env.plot(title='Initial state')
env.step(action='up')
env.plot()
env.step(action='right')
env.plot();
Exercise 1
Using the reward definitions from the previous slide, try to calculate the cumulative rewards for the trajectories shown below. Can you tell which of the paths are equally good / bad?
image by D. Silver - Lecture on RL
$S = \{\text{Class 1, Class 2, Class 3, Facebook, Pub, Pass, Sleep}\}$
Note that "Sleep" is also called a terminal state, because once in it we will never leave it.
image by D. Silver - Lecture on RL
$\Rightarrow$ Return: $G_0 = (-2) + 0.5 \cdot (-2) + 0.5^2 \cdot (-2) + 0.5^3 \cdot (+10) = -2.25$ (with $\gamma = 0.5$)
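We can quickly verify this return numerically; here is a minimal sketch, with the rewards and discount factor taken from the sample episode above:
# Sanity check: discounted return of the sample episode
# (rewards collected in Class 1, Class 2, Class 3, Pass)
rewards = [-2, -2, -2, 10]
gamma = 0.5
G_0 = sum(gamma**k * r for k, r in enumerate(rewards))
print(G_0)  # -2.25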
image by D. Silver - Lecture on RL
N.B.: stochastic state transitions are still allowed (if we decide to go to the Pub, anything can happen).
Today we will work with fully deterministic MDPs only.
Episodic MDP
Continuing MDP
Exercise 2
Let's get back to the maze! For now we do not care about optimal decisions. Instead, try to implement a random policy, i.e. every action $a \in \{\text{'up', 'down', 'left', 'right'}\}$ is picked with equal probability no matter what state the agent is in.
a) Initialize a Maze with height=3, width=2 and complete the all_actions list.
b) Look at every step of the output: are the movement of the agent (x) and the rewards obtained consistent with your expectations?
c) Change the random number seed, rerun and observe.
from qlearning.core import Maze
np.random.seed(123457)
env = Maze(height=3, width=2) # FILL HERE)
env.plot(title='Initial state')
all_actions = ['up', 'left', 'down', 'right'] # ... FILL HERE]
done = False
while not done:
    action = np.random.choice(all_actions)
    state, action, reward, new_state, done = env.step(action)
    env.plot();
There are many different algorithms for finding the optimal policy $\pi^*$
They all have their pros and cons
Today: we are going to look at Q-learning. It is one of the core ideas of many RL algorithms, such as
image by Open AI - Spinning Up
image adapted from S. Levine, "Deep Reinforcement Learning" (lecture)
to solve the RL problem
where $\alpha$ is the learning rate
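To make the update rule concrete, here is a minimal sketch of a single tabular Q-learning (TD) update; the function and variable names are illustrative and not part of the QLearner API:
# One temporal-difference update of the Q-table:
# Q(s, a) <- Q(s, a) + alpha * [r + gamma * max_a' Q(s', a') - Q(s, a)]
def td_update(Q, s, a, r, s_next, all_actions, alpha=0.1, gamma=0.9):
    td_target = r + gamma * max(Q[(s_next, a_next)] for a_next in all_actions)
    td_error = td_target - Q[(s, a)]
    Q[(s, a)] = Q[(s, a)] + alpha * td_error
    return Q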
image by AssemblyAI
image by Berkeley AI course
from qlearning.plot_utils import print_qtable
from qlearning.core import Maze, QLearner
np.random.seed(0)
# Initialize small maze environment
env = Maze(width=2, height=2, fire_positions=[[1, 0]])
_ = env.plot(add_player_position=False)
# Initialize Q-learner with Q-table
qtable_learner = QLearner(env, q_function='table')
print('Initial Q-table')
q_table = qtable_learner.q_func.get_q_table()
print_qtable(q_table)
Initial Q-table
+--------+-----+------+------+-------+
| s \ a  | up  | down | left | right |
+--------+-----+------+------+-------+
| (0, 0) | 0.0 | 0.0  | 0.0  | 0.0   |
| (0, 1) | 0.0 | 0.0  | 0.0  | 0.0   |
| (1, 0) | 0.0 | 0.0  | 0.0  | 0.0   |
| (1, 1) | 0.0 | 0.0  | 0.0  | 0.0   |
+--------+-----+------+------+-------+
qtable_learner.train(200)
print('Q-table after 200 episodes')
q_table = qtable_learner.q_func.get_q_table()
print_qtable(q_table)
Q-table after 200 episodes
+--------+------+------+------+-------+
| s \ a  |  up  | down | left | right |
+--------+------+------+------+-------+
| (0, 0) | 19.9 | 7.8  | 7.9  | 7.1   |
| (0, 1) | 14.8 | 11.0 | 14.1 | 26.6  |
| (1, 0) | 22.0 | 5.0  | 8.7  | 4.4   |
| (1, 1) | 0.0  | 0.0  | 0.0  | 0.0   |
+--------+------+------+------+-------+
qtable_learner.train(300)
print('Q-table after 500 episodes')
q_table = qtable_learner.q_func.get_q_table()
print_qtable(q_table)
Q-table after 500 episodes
+--------+------+------+------+-------+
| s \ a  |  up  | down | left | right |
+--------+------+------+------+-------+
| (0, 0) | 28.3 | 22.0 | 22.0 | 17.4  |
| (0, 1) | 23.8 | 25.8 | 24.1 | 29.9  |
| (1, 0) | 28.8 | 15.9 | 23.8 | 15.7  |
| (1, 1) | 0.0  | 0.0  | 0.0  | 0.0   |
+--------+------+------+------+-------+
qtable_learner.plot_training_evolution()
Exercise 3
a) Based on the evolution of the Q-values shown on the previous slide, would you consider the training to be complete after 500 episodes?
b) Play with the number of episodes in the cell below until you find convergence.
c) Observe that some of the Q-table values converge earlier than others during training. Why could that be?
from qlearning.plot_utils import print_qtable
from qlearning.core import Maze, QLearner
np.random.seed(0)
env = Maze(width=2, height=2, fire_positions=[[1, 0]])
qtable_learner = QLearner(env, q_function='table')
qtable_learner.train(1500) # FILL HERE)
qtable_learner.plot_training_evolution()
Exercise 4
a) Initialize a bigger maze with width=4, height=3, and fire_positions=[[2, 1], [2, 2]], and use q_function='table' in the QLearner class. Then train it for 5000 episodes.
b) Once the training is finished, plot the Q-values (you can just execute the cell, it is already complete).
Next to each little arrow there is a number that denotes the Q-value of the corresponding action on that field. The red arrow indicates the action with the highest Q-value.
c) Finally, also plot the (greedy) policy by executing the third cell. Compare it to the Q-value plot to verify that we indeed always pick the action with the highest Q-value. Are there fields where two actions would be equally good (which ones)? Can you confirm that by looking at the Q-values?
# Exercise 4 a)
from qlearning.core import Maze, QLearner
np.random.seed(123456)
env = Maze(width=4, # FILL HERE,
height=3, # FILL HERE,
fire_positions=[[2, 1], [2, 2]]) # FILL HERE)
qtable_learner = QLearner(env, q_function='table') # FILL HERE)
qtable_learner.train(5000) # FILL HERE)
# In case the training does not work for you for some reason
# you can reload the qtable from a trained agent from file.
# Don't forget to initialize the env as suggested in the
# exercise ...
# Note that the q evolution history of training is not saved
# and will hence not be displayed when reloading from file.
# qtable_learner.q_func.load_q_table('saved_agents/qtable_ex4.json')
# Exercise 4 b)
from qlearning.plot_utils import plot_q_table
q_table = qtable_learner.q_func.get_q_table()
ax = env.plot(add_player_position=False, title=False)
plot_q_table(q_table, env.target_position, env.fire_positions, ax=ax)
# Exercise 4 c)
from qlearning.plot_utils import plot_greedy_policy
policy = qtable_learner.q_func.get_greedy_policy()
ax = env.plot(add_player_position=False, title=False)
plot_greedy_policy(policy, env.target_position, env.fire_positions, ax=ax)
Exercise 5 (optional)
a) Using the same maze as above, reduce the punishment for going through fire by setting fire_reward=-2 (instead of -10) in the environment definition.
b) Retrain the agent. How does the policy change compared to Ex. 4? Can you explain why?
from qlearning.core import Maze, QLearner
from qlearning.plot_utils import plot_q_table, plot_greedy_policy
np.random.seed(123456)
# Env definition
env = Maze(width=4, height=3, fire_positions=[[2, 1], [2, 2]], fire_reward=-2) # FILL HERE)
qtable_learner = QLearner(env, q_function='table')
qtable_learner.train(500)
# If you have issues with the training, please comment out the
# qtable_learner.train(...) call above and reload instead the
# Q-table by uncommenting the following line.
# qtable_learner.q_func.load_q_table('saved_agents/qtable_ex5.json')
# Show Q-values
q_table = qtable_learner.q_func.get_q_table()
ax = env.plot(add_player_position=False, title=False)
plot_q_table(q_table, env.target_position, env.fire_positions, ax=ax)
# Show policy
policy = qtable_learner.q_func.get_greedy_policy()
ax = env.plot(add_player_position=False, title=False)
plot_greedy_policy(policy, env.target_position, env.fire_positions, ax=ax)
Main idea: replace the Q-table by a simple, feed-forward neural network (Q-net)
Developed by DeepMind in 2013 to play Atari games (DQN paper)
A neural network (NN) is a universal function approximator, i.e. a fit model that can approximate any function (in theory)
The Q-net is a mapping from state to Q-values of all possible actions
Its parameters, aka weights, are adjusted according to the TD rule, just like the Q-table entries (see the sketch below)
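For illustration, a minimal Q-net for the maze could be built with tf.keras as follows; this is only a sketch with assumed input dimensions and layer sizes, not necessarily the architecture used inside QLearner:
import tensorflow as tf

n_state_dims = 2  # e.g. (x, y) position of the agent (assumption)
n_actions = 4     # up, down, left, right

# Feed-forward Q-net: state in, one Q-value per action out
q_net = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(n_state_dims,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(n_actions, activation='linear'),
])
q_values = q_net(tf.constant([[0., 1.]]))  # shape: (1, n_actions)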
Exercise 6
a) Repeat the same steps as in Ex. 4 for the Q-table learner, but this time using q_function='net' as an argument in the QLearner class. Train it for 1500 episodes. This will take a couple of minutes.
b) Compare the Q-values and policy to the one obtained with Q-table learning. Do you see differences? Why could that be?
from qlearning.core import Maze, QLearner
import tensorflow as tf
tf.keras.utils.set_random_seed(0)
env = Maze(width=4, height=3, fire_positions=[[2, 1], [2, 2]])
qnet_learner = QLearner(env, q_function='net') # FILL HERE)
qnet_learner.train(1500) # FILL HERE)
# Again, if you face any issues with model training, please use
# the saved q-net weights of a trained agent by uncommenting
# the following. You can comment the line for training in the
# previous cell.
# qnet_learner.q_func.load_model('saved_agents/qnet_ex6')
from qlearning.plot_utils import plot_q_table, plot_greedy_policy
q_table = qnet_learner.q_func.get_q_table()
ax = env.plot(add_player_position=False, title=False)
plot_q_table(q_table, env.target_position, env.fire_positions, ax=ax)
policy = qnet_learner.q_func.get_greedy_policy()
ax = env.plot(add_player_position=False, title=False)
plot_greedy_policy(policy, env.target_position, env.fire_positions, ax=ax)
Q-table
Easy to understand and validate
Discrete $S$, $A$ spaces only
Relatively small $S$, $A$ spaces only
DQN
Big and continuous $S$ possible
No need to visit all states during training, because NNs are great interpolators
Discrete and relatively small $A$
Training may be unstable, and it is harder to verify whether convergence has been reached
Many real-world problems require continuous $S$ and continuous $A$ $\,\,\Rightarrow\,\,$ actor-critic methods
Two NNs
Actor
Critic
N.B.: the two networks are trained simultaneously (see the sketch below)
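Schematically, the two networks could be set up like this; a minimal tf.keras sketch with assumed dimensions and layer sizes, not the ClassicalDDPG implementation used below:
import tensorflow as tf

n_states, n_actions = 10, 10  # continuous state and action dimensions (assumption)

# Actor: state -> action (deterministic policy), outputs scaled to [-1, 1]
actor = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(n_states,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(n_actions, activation='tanh'),
])

# Critic: (state, action) -> scalar Q-value
state_in = tf.keras.layers.Input(shape=(n_states,))
action_in = tf.keras.layers.Input(shape=(n_actions,))
x = tf.keras.layers.Concatenate()([state_in, action_in])
x = tf.keras.layers.Dense(64, activation='relu')(x)
q_value = tf.keras.layers.Dense(1)(x)
critic = tf.keras.Model(inputs=[state_in, action_in], outputs=q_value)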
image by AWAKE Collaboration
Goal: given measured beam positions (= continuous state), find best dipole corrector settings (= continuous actions) to keep beam close to the center of vacuum pipe
State: 10-d array of beam positions measured along the line
Action: 10-d array of dipole corrector strengths along the line
Reward: negative RMS of the beam offsets w.r.t. the center (see the sketch below)
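For instance, the reward for a set of measured beam positions could be computed as follows; this is a sketch only, and the actual e_trajectory environment may scale or clip the value differently:
import numpy as np

def trajectory_reward(bpm_positions):
    # Negative RMS of the beam offsets w.r.t. the pipe center (at 0)
    return -np.sqrt(np.mean(np.asarray(bpm_positions)**2))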
Exercise 7
Let's try to train an actor-critic agent on the AWAKE environment! We are using the DDPG (Deep Deterministic Policy Gradient) algorithm. It is one of the most basic actor-critic algorithms and hence also not the most stable one. Some improvements have been implemented in TD3.
7 a) Run the following cell to initialize the AWAKE simulation environment env and a DDPG instance agent. Then reset the environment to mis-steer the beam, and plot the trajectory. The plot shows the beam position at the 10 BPMs (beam position monitors) installed along the electron beam line.
# Exercise 7 a)
from actor_critic.awake_env import e_trajectory
from actor_critic.core import ClassicalDDPG, trainer, plot_training_log, run_correction
import tensorflow as tf
tf.keras.utils.set_random_seed(12345)
env = e_trajectory()
agent = ClassicalDDPG(state_space=env.observation_space, action_space=env.action_space)
env.reset(init_outside_threshold=True)
env.plot_trajectory()
7 b) Run the next cell to make a correction to the beam position. Run the cell multiple times and check how the trajectories before and after correction compare. Do you think the RL agent is doing a good job? Why or why not?
# Exercise 7 b)
run_correction(env, agent)
7 c) Run the next cell to train the RL agent. Can you interpret the output plots showing evolution of agent training? Is the length of training appropriate or should we train with fewer / more steps?
Hints: the output figure shows two axes. The top graph displays the length of each episode over the entire training period (an episode is terminated either when the objective is reached or whenever the agent cannot solve the task after 30 steps). The bottom plot shows the rewards (negative trajectory rms) at the beginning and at the end of an episode. A high negative reward means that the trajectory is badly steered. A reward close to zero on the other hand corresponds to a well-corrected beam trajectory.
# Exercise 7 c)
training_log = trainer(env=env, agent=agent, n_steps=500)
EPISODE: 0, INITIAL REWARD: -131.397, FINAL REWARD: -109.155, #STEPS: 1.
EPISODE: 50, INITIAL REWARD: -308.231, FINAL REWARD: -74.137, #STEPS: 5.
EPISODE: 100, INITIAL REWARD: -214.77, FINAL REWARD: -47.199, #STEPS: 1.
EPISODE: 150, INITIAL REWARD: -349.32, FINAL REWARD: -8.142, #STEPS: 1.
EPISODE: 200, INITIAL REWARD: -117.818, FINAL REWARD: -60.099, #STEPS: 1.
EPISODE: 250, INITIAL REWARD: -52.63, FINAL REWARD: -5.303, #STEPS: 1.
plot_training_log(env, agent, training_log)
# In case you face any issues with the training of the agent,
# avoid executing the previous cell and use instead the
# following line to load pre-trained weights.
# agent.load_actor_critic_weights('saved_agents/ddpg_ex7')
7 d) Check a few trajectories before and after correction now using the trained agent (run the cell multiple times). How does the agent perform now?
Note that we are using the DDPG algorithm here, which is one of the most basic actor-critic RL algorithms. This is also why the trajectory correction will not always be perfect. There are much-improved versions that train in a shorter time and with better performance (e.g. Twin Delayed DDPG, aka TD3).
# Exercise 7 d)
run_correction(env, agent)
Reinforcement learning (RL) is concerned with solving decision-making problems and optimizing for best behavior in an environment
Different algorithms exist with Q-learning being one of the fundamental ones
Q-learning uses a state-action value function $Q(s,a)$ that estimates the expected return
$Q(s, a)$ is iteratively learned following the temporal difference rule
Once converged, we can read off the optimal policy by acting greedily with respect to $Q$
Q-learning
Actor-critic methods are built on top of Q-learning and use two networks: