Numerical Methods in Accelerator Physics

Lecture Series by Dr. Adrian Oeftiger

Guest Lecture by Dr. Michael Schenk

Lecture 12

Run this notebook online!

Interact and run this jupyter notebook online:

via the public mybinder.org service
via the public gesis.org service

Also find this lecture rendered as HTML slides on github $\nearrow$ along with the source repository $\nearrow$.

Run this first!

Imports and modules:

In [1]:
from config import (np, plt, print_qtable, Maze, QLearner, plot_q_table, 
                    plot_greedy_policy, tf, e_trajectory,
                    ClassicalDDPG, trainer, plot_training_log, run_correction)
%matplotlib inline

If the tqdm progress bars (trange) in this notebook do not display, run this:

In [2]:
!jupyter nbextension enable --py widgetsnbextension
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: OK

Refresher!

  • Bayesian Optimisation (BO) as global optimisation method for black-box functions
  • Gaussian Process (GP): statistical surrogate model
  • Acquisition Function: guide the optimisation
  • BO suitable for optimisation – not for control tasks
  • adequate for a moderate number of dimensions (~100)

Today!

  1. Introduction to Reinforcement Learning
  2. Reinforcement Learning Formalism
  3. Q-learning
  4. Actor-critic Methods

Disclaimer

  • Today's introduction to reinforcement learning (RL) is by no means (mathematically) complete
  • Want to give a high-level overview and some first ideas on the subject to hopefully spark your interest :)
  • RL is a fascinating field: if you want to learn more, there are some great resources at the end of these slides and on the web

Part I: Introduction to Reinforcement Learning

Machine learning landscape

Machine learning landscape

image by GeekStyle

Reinforcement learning examples

DeepMind, 2015 & 2017: AlphaGo & AlphaZero

  • Famous RL success story: agent learns to play the game of Go and beats world champion Lee Sedol
  • In case you want to know more: documentary on YouTube
AlphaGo

DeepMind AlphaGo

Reinforcement learning examples

OpenAI, 2019: hide-and-seek

  • RL agents learning to play hide-and-seek in a multi-agent setting
  • We recommend watching the short video
Hide and seek

OpenAI

Reinforcement learning examples

DeepMind & EPFL, 2022: tokamak control

  • Shaping and maintaining high-temperature plasma within tokamak vessel is challenging
  • Requires high-dimensional, high-frequency, closed-loop control using magnetic actuator coils
  • Paper describes RL agent that was successfully trained as a magnetic controller
RL for Tokamak

Paper, EuroFusion

Reinforcement learning examples

DeepMind, 2022: AlphaTensor

  • Improve computational efficiency of matrix multiplication
  • RL agent discovered more efficient algorithms than those developed by humans
  • Benefits countless fields
AlphaTensor

Paper

Reinforcement learning examples

UZH & Intel Labs, 2023: Drone racing

  • Training in simulations with mixed-in residual models from real data
  • RL agent beats human drone racing champions in real environment
DroneRacing

Paper

What is reinforcement learning?

  • Application: online optimal control, decision-making tasks
  • Trial-and-error learning: for environment in state $s_t$, agent takes action $a_t$ and collects reward $r_t$
  • Goal: learn optimal behavior in an environment
    • Behavior: a.k.a. policy $\pi$ - "what action to pick in a given state?"
    • Optimality: maximizing return $G_t = \sum_{k = 0} \gamma^k\,r_{t+k}$ with $\gamma \in (0, 1)$
  • Can be solved in various ways: many algorithms exist
  • We provide minimal input: the reward function, the state definition, and the action space (a sketch of the interaction loop follows below)
RL schematically
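
The trial-and-error loop above can be sketched in a few lines of Python, assuming a gym-style environment interface with reset() and step() (the method names and return values are illustrative assumptions, not the exact API of the Maze class used later):

# Hypothetical gym-style interaction loop (not the exact Maze API)
def run_episode(env, policy, gamma=0.9):
    state = env.reset()
    G, k, done = 0.0, 0, False
    while not done:
        action = policy(state)                  # agent picks a_t based on s_t
        state, reward, done = env.step(action)  # environment returns r_t and s_{t+1}
        G += gamma**k * reward                  # accumulate the discounted return
        k += 1
    return G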

An example: Pacman

  • For games it is typically easy to define what the state, actions, and rewards are
    • State: where am I? Where are ghosts, snacks, cookies?
    • Actions: up, down, left, right
    • Reward: food (+), ghosts (-)
    • Return: how much food am I going to eat over time?
    • Policy: based on current state of the screen, should I go up, down, left, or right?
Pacman example

Another example: beam trajectory steering

  • Maximize integrated beam on target
    • State: beam position somewhere in the line (continuous variable)
    • Action: increase or decrease dipole kick angle, or strength (continuous variable)
    • Reward: amount of beam on target
1D beam steering

Today's lecture

  • There are various RL algorithms suitable for different types of tasks

  • Often the choice of algorithm depends on whether we deal with discrete or continuous state-action spaces

  • We will go through:

    1. Discrete states, discrete actions: Q-learning with a lookup table
    2. Continuous states, discrete actions: Q-learning with a neural network (deep Q-learning or DQN)
    3. Continuous states, continuous actions: actor-critic algorithm $\Rightarrow$ what we typically need to control accelerator systems ...

Our environment

  • A small grid maze
  • There are fires and a target field
  • A player, or agent, has to navigate through the maze and find the target field
In [3]:
env = Maze(height=3, width=5)
env.plot(title='Initial state');
In [4]:
# Take some actions
env.plot(title='Initial state')
env.step(action='up')
env.plot()
env.step(action='right')
env.plot();

RL definitions

  • State: the player / agent position ( x ), whose coordinates are given by a tuple $(x, y)$
  • Action: 'up', 'down', 'left', 'right'
  • Reward: every action comes with a reward, depending on the new state we end up in (see the sketch after this list)
    • Taking a step into an empty field: -1
    • Bumping into walls: -5
    • Going through fire: -10
    • Reaching the goal: +30
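
For illustration only (this is not necessarily how the Maze class stores its rewards internally), the assignment above can be thought of as a simple lookup table:

# Illustrative reward table for the maze environment
step_rewards = {
    'empty': -1,   # step onto an empty field
    'wall':  -5,   # bump into a wall
    'fire':  -10,  # go through fire
    'goal':  +30,  # reach the target field
}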

RL is about taking the best decisions ...

  • Obviously there are better and worse trajectories to reach the target. "Better" and "worse" refer to how much reward we can collect along the way.
  • We will get back to that

Exercise 1

Using the reward definitions from the previous slide, try to calculate the cumulative rewards for the trajectories shown below. Can you tell which of the paths are equally good / bad?
Example trajectories

Part II: Reinforcement Learning Formalism

Markov process

  • A memoryless random process consisting of a set of states $S$ and state transition probabilities
  • The state should possess the Markov property:
    • At every time $t$, the future evolution of the environment depends only on the information contained in the current state $s_t$, but not on the history of past states $s_{t-1}, s_{t-2}, ...$ (memorylessness)
    • In other words: $P(s_{t+1} | s_t) = P(s_{t+1} | s_{t}, s_{t-1}, ..., s_0)$
  • Examples:
    • Chess: the positions of the pieces on the board fully define the state. There are many ways to reach a given position, but how it was reached matters neither for the next move nor for the future evolution of the game.
    • Flight of a cannonball: a state given by the current position and velocity vector provides enough information to predict the future evolution. (It would be non-Markov if the state contained only the position but not the velocity.)
Markov chain

image by D. Silver - Lecture on RL

$S = \{\text{Class 1, Class 2, Class 3, Facebook, Pub, Pass, Sleep}\}$
Note that "Sleep" is also called a terminal state, because once in it we will never leave it.

Markov reward process

  • A Markov process that has in addition a reward function and a discount factor $\gamma \in [0, 1]$
  • Return $G_t$: sum of discounted future rewards
$$G_t = \sum_k \gamma^k \, r_{t+k}$$
  • $\gamma$ controls the relative importance of immediate vs future rewards
    • $\gamma \rightarrow 0$: we only care about immediate rewards
    • $\gamma \rightarrow 1$: we care about rewards far in the future
Markov reward process

image by D. Silver - Lecture on RL

  • Example: C1 $\rightarrow$ C2 $\rightarrow$ C3 $\rightarrow$ Pass $\rightarrow$ Sleep


$\Rightarrow$ Return: $G_0 = (-2) + 0.5 \cdot (-2) + 0.5^2 \cdot (-2) + 0.5^3 \cdot (+10) = -2.25$ (with $\gamma = 0.5$)
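
The same calculation in a few lines of Python (a sketch based on the reward sequence above):

rewards = [-2, -2, -2, +10]   # C1 -> C2 -> C3 -> Pass (entering Sleep ends the episode)
gamma = 0.5
G0 = sum(gamma**k * r for k, r in enumerate(rewards))
print(G0)                      # -2.25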

Markov decision process (MDP)

  • Extend Markov reward process by adding decision making: set of possible actions $A$ (= action space)
Markov decision process

image by D. Silver - Lecture on RL


N.B.: stochastic state transitions are still allowed (if we decide to go to the Pub, anything can happen).
Today we will work with fully deterministic MDPs only.

  • Episodic MDP

    • Each episode ends in a terminal state
    • Return $G_t$ is the sum of discounted rewards collected from time $t$ till end of episode
    • Episodes are independent
  • Continuous MDP

    • Continues indefinitely: has no terminal states
    • Very important that the discount factor $\gamma < 1$ to avoid infinite returns (see the bound below)
    • Also known as infinite horizon MDP
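
To see why $\gamma < 1$ is needed: if all rewards are bounded in magnitude by $r_\text{max}$, the return is bounded by a geometric series,

$$|G_t| \le \sum_{k=0}^{\infty} \gamma^k \, r_\text{max} = \frac{r_\text{max}}{1 - \gamma},$$

which is finite for $\gamma < 1$ but diverges as $\gamma \rightarrow 1$.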

Policy $\pi$

  • The policy defines the decision making or behavior of the agent
  • It assigns to each state a probability distribution over the actions. You can also think of it as a mapping that assigns to each state-action pair $(s, a)$ the probability of taking $a$ in $s$
$$\pi: S \times A \rightarrow [0, 1]$$
  • $S$ and $A$ are the state and action spaces, respectively
    • For our maze: $S = \{[0, 0], [0, 1], ..., [\text{width}-1, \text{height}-1]\}$ and $A = \{\text{'up', 'down', 'left', 'right'}\}$.

Exercise 2

Let's get back to the maze! For now we do not care about optimal decisions. Instead, try to implement a random policy, i.e. every action $a \in \{\text{'up', 'down', 'left', 'right'}\}$ is picked with equal probability no matter what state the agent is in.

a) Initialize a Maze with height=3, width=2 and complete the all_actions list.

b) Look at every step of the output: is the movement of the agent ( x ) and the rewards obtained consistent with your expectations?

c) Change the random number seed, rerun and observe.

In [5]:
np.random.seed(123457)

env = Maze(height=3, width=2) # FILL HERE)
env.plot(title='Initial state')

all_actions = ['up', 'down', 'left', 'right'] # ... FILL HERE]

done = False
while not done:
    action = np.random.choice(all_actions)
    state, action, reward, new_state, done = env.step(action)
    env.plot();

RL objective

  • Find optimal behavior in a given environment: in every state we want the agent to take the best action
  • This is also known as the optimal policy $\pi^*$
  • Formally, $\pi^*$ maximizes the return $G_t = \sum_k \gamma^k \, r_{t+k}$, i.e. the cumulative sum of discounted future rewards.
  • For our maze, RL will solve ...
    • For any given field that we are currently on (= state), what is the action that maximizes the sum of rewards collected over time?
    • Or: from where I stand, how can I reach the target field in the fewest steps while avoiding fires (if possible)?

RL taxonomy

  • There are many different algorithms for finding the optimal policy $\pi^*$

  • They all have their pros and cons

    • Often, sample efficiency is crucial
    • It tells us how many interactions with the environment (= how many data samples) we need to solve the RL problem
    • E.g. for accelerator systems: we want to train the agent with as little beam time as possible as it is very expensive
  • Today: we are going to look at Q-learning. It is one of the core ideas of many RL algorithms, such as

    • Deep Q-learning (DQN)
    • Actor-critic methods (DDPG, TD3, SAC)
The RL algorithm zoo

image by Open AI - Spinning Up

Sample efficiency

image adapted from S. Levine, "Deep Reinforcement Learning" (lecture)

Intermediate summary

  • The goal of RL is to make optimal decisions (take actions) in an environment based on some observables (state)
  • Example environments: game, control system (e.g. fusion reactor, tuning accelerator parameters), trading, ...
  • The quality of a decision made is quantified by a reward
  • Through trial-and-error the RL agent collects rewards and can eventually learn the best behavior (optimal policy $\pi^*$)
  • Formally this is described as a Markov decision process (MDP)

Part III: Q-learning

Q-learning

  • Employs a state-action value function
$$Q: S \times A \rightarrow \mathbb{R}$$

to solve the RL problem

  • The Q-value $Q(s, a)$ characterizes the "quality" of the state-action pair $(s, a)$
    • Quality is measured as the expected return when acting according to a certain policy
    $$Q(s, a) = \mathbb{E}[G_t | S_t = s, A_t = a]$$
    • Reminder: $G_t = \sum_k \gamma^k \, r_{t+k}$
    • Q answers: “In a given state, what is the best action to take to maximize return?”
  • Temporal difference (TD) learning, i.e. learning the Q-function
    • We can write the Q-value of $(s_t, a_t)$ as the immediate reward $r_t$ plus the discounted maximum Q-value over actions in the next state $s_{t+1}$
    • We act greedily on the next state, hence the $\max$ operation
Q-learning backup diagram

  • Temporal difference (TD) rule
    • Q-values are initially unknown / random
    • Learn iteratively following the TD update rule, using collected interactions with the environment (sketched in code below)
Q-learning backup diagram
  • Use trial-and-error experiences collected by the agent $(s_t, a_t, r_t, s_{t+1})$: state, action, reward, next state.
  • This is the core idea of Q-learning and is a result of one of the Bellman equations
  • Bootstrapping: the Q-value is updated towards a target that is itself based on an estimate $\rightarrow$ a "moving target": training can be unstable, hence variants like double Q-learning
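
Putting the TD rule into code, a minimal sketch of the tabular update looks as follows (the learning rate $\alpha$ and the data layout of Q are assumptions for illustration, not the exact implementation inside the QLearner class):

# One Q-learning update from a single experience (s, a, r, s_next, done).
# Q is assumed to be a dict mapping each state to a list of Q-values, one per action.
def td_update(Q, s, a, r, s_next, done, alpha=0.1, gamma=0.9):
    target = r if done else r + gamma * max(Q[s_next])  # bootstrapped (greedy) target
    Q[s][a] += alpha * (target - Q[s][a])                # move the estimate towards the target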

Obtaining the optimal policy

  • Once Q-values have converged, it is easy to read off the optimal policy $\pi^*$
$$ \pi^*(s, a) = \left\{ \begin{array}{ll} 1 & \mbox{if } a = \text{argmax}_{a'} \, Q(s, a') \\ 0 & \mbox{otherwise.} \end{array} \right. $$
  • This is also known as the greedy policy: it acts greedily in terms of expected return by assigning probability $1$ to the action that maximizes the Q-function (see the sketch below).
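
In code, reading off the greedy policy amounts to an argmax over the actions for each state (a sketch, assuming the same dict-of-lists Q-table layout as in the TD sketch above):

def greedy_policy(Q):
    # For each state, pick the index of the action with the largest Q-value.
    return {s: q_vals.index(max(q_vals)) for s, q_vals in Q.items()}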

How to implement Q-learning?

  • We need a way to track and update the Q-value for each state-action pair
    • Traditional Q-learning: Q-table
    • Deep Q-learning: neural network
Q-learning vs DQN

image by AssemblyAI

Some challenges

  • Reward engineering
  • Definition of the state, which is sometimes only partially observable
  • Hyperparameter tuning, making training stable
  • Exploration vs exploitation ...

Exploration-exploitation trade-off

  • To learn the best policy in the most efficient manner, we need a trade-off between exploration and exploitation.
  • We have to ensure that the agent keeps exploring new actions during training and does not just always follow the path that provides the highest expected return
  • After all, the Q-values might not have converged yet: there may still be better solutions than the ones currently known (see the $\epsilon$-greedy sketch below)
Exploration-exploitation tradeoff

image by Berkeley AI course
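
One common recipe is $\epsilon$-greedy action selection, sketched below (whether the QLearner class uses exactly this scheme, and with which $\epsilon$ schedule, is an assumption):

import numpy as np

def epsilon_greedy_action(Q, s, n_actions, epsilon=0.1):
    # Explore with probability epsilon, otherwise exploit the current greedy choice.
    if np.random.rand() < epsilon:
        return np.random.randint(n_actions)   # random action index
    return int(np.argmax(Q[s]))               # greedy action index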

Q-learning with lookup table

  • The first Q-learning method we consider is using a lookup table to keep track of the Q-values during training
  • First, we initialize all Q-values to 0, then update the values according to the TD rule
  • N.B.: this method works only for discrete sets of states and actions
Q-table learning

In [6]:
np.random.seed(0)

# Initialize small maze environment
env = Maze(width=2, height=2, fire_positions=[[1, 0]])
_ = env.plot(add_player_position=False)

# Initialize Q-learner with Q-table
qtable_learner = QLearner(env, q_function='table')

print('Initial Q-table')
q_table = qtable_learner.q_func.get_q_table()
print_qtable(q_table)
Initial Q-table
+--------+-----+------+------+-------+
| s \ a  |  up | down | left | right |
+--------+-----+------+------+-------+
| (0, 0) | 0.0 | 0.0  | 0.0  |  0.0  |
| (0, 1) | 0.0 | 0.0  | 0.0  |  0.0  |
| (1, 0) | 0.0 | 0.0  | 0.0  |  0.0  |
| (1, 1) | 0.0 | 0.0  | 0.0  |  0.0  |
+--------+-----+------+------+-------+
In [7]:
qtable_learner.train(200)

print('Q-table after 200 episodes')
q_table = qtable_learner.q_func.get_q_table()
print_qtable(q_table)
  0%|          | 0/200 [00:00<?, ?it/s]
Q-table after 200 episodes
+--------+------+------+------+-------+
| s \ a  |  up  | down | left | right |
+--------+------+------+------+-------+
| (0, 0) | 19.9 | 7.8  | 7.9  |  7.1  |
| (0, 1) | 14.8 | 11.0 | 14.1 |  26.6 |
| (1, 0) | 22.0 | 5.0  | 8.7  |  4.4  |
| (1, 1) | 0.0  | 0.0  | 0.0  |  0.0  |
+--------+------+------+------+-------+
In [8]:
qtable_learner.train(300)

print('Q-table after 500 episodes')
q_table = qtable_learner.q_func.get_q_table()
print_qtable(q_table)
  0%|          | 0/300 [00:00<?, ?it/s]
Q-table after 500 episodes
+--------+------+------+------+-------+
| s \ a  |  up  | down | left | right |
+--------+------+------+------+-------+
| (0, 0) | 28.3 | 22.0 | 22.0 |  17.4 |
| (0, 1) | 23.8 | 25.8 | 24.1 |  29.9 |
| (1, 0) | 28.8 | 15.9 | 23.8 |  15.7 |
| (1, 1) | 0.0  | 0.0  | 0.0  |  0.0  |
+--------+------+------+------+-------+
In [9]:
qtable_learner.plot_training_evolution()

Exercise 3

a) Based on the evolution of the Q-values on the previous slide - would you consider the training to be complete after 500 episodes?

b) Play with the number of episodes in the cell below until you find convergence.

c) Observe that some of the Q-table values converge earlier than others during training. Why could that be?

In [10]:
np.random.seed(0)

env = Maze(width=2, height=2, fire_positions=[[1, 0]])

qtable_learner = QLearner(env, q_function='table')
qtable_learner.train(2000) # FILL HERE)
qtable_learner.plot_training_evolution()
  0%|          | 0/2000 [00:00<?, ?it/s]

Exercise 4

a) Initialize a bigger maze width=4, height=3, with fire_positions=[[2, 1], [2, 2]] and use q_function='table' in the QLearner class. Then train it for 5000 episodes.

b) Once the training is finished, plot the Q-values (you can just execute the cell, it is already complete).
Next to each little arrow there is a number that denotes the Q-value of the corresponding action on that field. The red arrow indicates the action with the highest Q-value.

c) Finally, also plot the (greedy) policy by executing the third cell. Compare it to the Q-value plot to verify that we indeed always pick the action with the highest Q-value. Are there fields where two actions would be equally good (which ones)? Can you confirm that by looking at the Q-values?

In [11]:
# Exercise 4 a)
np.random.seed(123456)

env = Maze(width=4, # FILL HERE,
           height=3, # FILL HERE,
           fire_positions=[[2, 1], [2, 2]]) # FILL HERE)

qtable_learner = QLearner(env, q_function='table') # FILL HERE)
qtable_learner.train(5000) # FILL HERE)
  0%|          | 0/5000 [00:00<?, ?it/s]
In [12]:
# In case the training does not work for you for some reason
# you can reload the qtable from a trained agent from file.
# Don't forget to initialize the env as suggested in the
# exercise ...
# Note that the q evolution history of training is not saved
# and will hence not be displayed when reloading from file.

# qtable_learner.q_func.load_q_table('saved_agents/qtable_ex4.json')
In [13]:
# Exercise 4 b)
q_table = qtable_learner.q_func.get_q_table()
ax = env.plot(add_player_position=False, title=False)
plot_q_table(q_table, env.target_position, env.fire_positions, ax=ax)
In [14]:
# Exercise 4 c)
policy = qtable_learner.q_func.get_greedy_policy()
ax = env.plot(add_player_position=False, title=False)
plot_greedy_policy(policy, env.target_position, env.fire_positions, ax=ax)

Exercise 5 (optional)

a) Using the same maze as above, reduce the punishment of going through fire by setting a fire_reward=-2 (instead of -10) in the environment definition.

b) Retrain the agent. How does the policy change compared to Ex. 4? Can you explain why?

In [15]:
np.random.seed(123456)

# Env definition
env = Maze(width=4, height=3, fire_positions=[[2, 1], [2, 2]], fire_reward=-2) # FILL HERE)
qtable_learner = QLearner(env, q_function='table')
qtable_learner.train(500)

# If you have issues with the training, please comment out the
# qtable_learner.train(500) line above and instead reload the
# Q-table by uncommenting the following line.

# qtable_learner.q_func.load_q_table('saved_agents/qtable_ex5.json')
  0%|          | 0/500 [00:00<?, ?it/s]
In [16]:
# Show Q-values
q_table = qtable_learner.q_func.get_q_table()
ax = env.plot(add_player_position=False, title=False)
plot_q_table(q_table, env.target_position, env.fire_positions, ax=ax)
In [17]:
# Show policy
policy = qtable_learner.q_func.get_greedy_policy()
ax = env.plot(add_player_position=False, title=False)
plot_greedy_policy(policy, env.target_position, env.fire_positions, ax=ax)

Deep Q-learning (DQN)

  • Main idea: replace the Q-table by a simple, feed-forward neural network (Q-net)
  • Developed by DeepMind in 2013 to play Atari games (DQN paper)
  • A neural network (NN) is a universal function approximator, i.e. a model that can (in theory) approximate any function
  • The Q-net is a mapping from the state to the Q-values of all possible actions (see the sketch below)
  • Its parameters, a.k.a. weights, are adjusted according to the TD rule, just like the Q-table entries
Q-net
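
A minimal Q-net in tf.keras could look like the sketch below, mapping the 2-d maze state $(x, y)$ to four Q-values, one per action (layer sizes and optimizer are illustrative assumptions, not the architecture used by the QLearner class):

import tensorflow as tf

# Illustrative Q-network: state (x, y) in, one Q-value per action out.
q_net = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(2,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(4),   # Q-values for 'up', 'down', 'left', 'right'
])
q_net.compile(optimizer='adam', loss='mse')   # regress on TD targets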

Exercise 6

a) Repeat the same steps as in Ex. 4 for the Q-table learner, but this time using q_function='net' as an argument in the QLearner class. Train it for 1500 episodes. This will take a couple of minutes.

b) Compare the Q-values and policy to the one obtained with Q-table learning. Do you see differences? Why could that be?

In [18]:
tf.keras.utils.set_random_seed(0)

env = Maze(width=4, height=3, fire_positions=[[2, 1], [2, 2]])
qnet_learner = QLearner(env, q_function='net') # FILL HERE)
qnet_learner.train(1500) # FILL HERE)
  0%|          | 0/1500 [00:00<?, ?it/s]
In [19]:
# Again, if you face any issues with model training, please use
# the saved q-net weights of a trained agent by uncommenting
# the following. You can comment the line for training in the
# previous cell.

# qnet_learner.q_func.load_model('saved_agents/qnet_ex6')
In [20]:
q_table = qnet_learner.q_func.get_q_table()
ax = env.plot(add_player_position=False, title=False)
plot_q_table(q_table, env.target_position, env.fire_positions, ax=ax)
In [21]:
policy = qnet_learner.q_func.get_greedy_policy()
ax = env.plot(add_player_position=False, title=False)
plot_greedy_policy(policy, env.target_position, env.fire_positions, ax=ax)

Q-table vs DQN: pros, cons, and limitations

  • Q-table
    • Easy to understand and validate

    • Discrete $S$, $A$ spaces only

    • Relatively small $S$, $A$ spaces only

  • DQN
    • Big and continuous $S$ possible

    • No need to visit all states during training, because NNs are great interpolators

    • Discrete and relatively small $A$

    • Training may be unstable, and it is harder to verify whether convergence has been reached

  • Many real-world problems require continuous $S$ and continuous $A$ $\,\,\Rightarrow\,\,$ actor-critic methods

Part IV: Actor-critic Methods

Actor-critic scheme

  • Two NNs

  • Actor

    • Represents the policy $\pi$ and is a mapping $\pi: S \rightarrow A$
    • For each continuous state, it proposes a continuous action
    • Learns from the critic
  • Critic

    • Predicts Q-values and is a mapping $Q: S\times A \rightarrow \mathbb{R}$
    • Evaluates quality of $(s, a)$ pair proposed by actor
    • Feeds back to the actor network: policy gradient rule

N.B.: networks are trained simultaneously

Actor-critic schematic
  • Critic parameters $\theta$ are updated according to the TD rule, just like in Q-learning
  • Actor parameters $\chi$ are updated via the policy gradient: for a given state $s$, how does the actor have to adjust its parameters to propose an action $a$ such that $Q(s,a)$ becomes larger? (Both networks are sketched below.)
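
A sketch of the two networks in tf.keras (dimensions and layer sizes are illustrative and chosen to match the AWAKE example below; this is not the architecture used by the ClassicalDDPG class):

import tensorflow as tf

n_state, n_action = 10, 10   # e.g. 10 BPM readings and 10 corrector strengths

# Actor: deterministic policy pi(s) -> a, output squashed into a bounded action range
actor = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(n_state,)),
    tf.keras.layers.Dense(n_action, activation='tanh'),
])

# Critic: Q(s, a) -> scalar, with state and action concatenated at the input
state_in = tf.keras.Input(shape=(n_state,))
action_in = tf.keras.Input(shape=(n_action,))
hidden = tf.keras.layers.Dense(64, activation='relu')(
    tf.keras.layers.Concatenate()([state_in, action_in]))
critic = tf.keras.Model([state_in, action_in], tf.keras.layers.Dense(1)(hidden))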

Application in accelerator physics

  • We are going to consider a trajectory steering problem from CERN's AWAKE
  • Advanced Proton Driven Plasma Wakefield Acceleration Experiment

AWAKE

image by AWAKE Collaboration

AWAKE electron beam line

AWAKE beamline

RL task definitions

  • Goal: given the measured beam positions (= continuous state), find the best dipole corrector settings (= continuous actions) to keep the beam close to the center of the vacuum pipe
  • State: 10-d array of beam positions measured along the line
  • Action: 10-d array of dipole corrector strengths along the line
  • Reward: negative RMS of the beam offsets w.r.t. the center (sketched below)
Electron beam line steering task
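
As a sketch, the reward for a set of BPM readings could be computed as below (illustrative only; the exact units and scaling used in e_trajectory are assumptions):

import numpy as np

def trajectory_reward(bpm_positions):
    # Negative RMS of the measured beam offsets: close to zero is good,
    # large negative values indicate a badly steered trajectory.
    return -np.sqrt(np.mean(np.asarray(bpm_positions) ** 2))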

Exercise 7

Let's try to train an actor-critic agent on the AWAKE environment! We are using the DDPG (Deep Deterministic Policy Gradient) algorithm. It is one of the most basic actor-critic algorithms and hence also not the most stable one. Some improvements have been implemented in TD3.

7 a) Run the following cell to initialize the AWAKE simulation environment env and a DDPG instance agent. Then reset the environment to misteer the beam, and plot the trajectory. The plot shows the beam position at the 10 BPMs installed along the electron beam line.

In [22]:
# Exercise 7 a)
tf.keras.utils.set_random_seed(12345)

env = e_trajectory()
agent = ClassicalDDPG(state_space=env.observation_space, action_space=env.action_space)

env.reset(init_outside_threshold=True)
env.plot_trajectory()

7 b) Run the next cell to make a correction to the beam position. Run the cell multiple times and check how the trajectories before and after correction compare. Do you think the RL agent is doing a good job? Why or why not?

In [23]:
# Exercise 7 b)
run_correction(env, agent)
In [24]:
# Exercise 7 b)
run_correction(env, agent)

7 c) Run the next cell to train the RL agent. Can you interpret the output plots showing evolution of agent training? Is the length of training appropriate or should we train with fewer / more steps?

Hints: the output figure shows two axes. The top graph displays the length of each episode over the entire training period (an episode is terminated either when the objective is reached or whenever the agent cannot solve the task after 30 steps). The bottom plot shows the rewards (negative trajectory rms) at the beginning and at the end of an episode. A high negative reward means that the trajectory is badly steered. A reward close to zero on the other hand corresponds to a well-corrected beam trajectory.

In [25]:
# Exercise 7 c)
training_log = trainer(env=env, agent=agent, n_steps=500)
  0%|          | 0/500 [00:00<?, ?it/s]
EPISODE: 0, INITIAL REWARD: -134.26, FINAL REWARD: -73.393, #STEPS: 1.
EPISODE: 50, INITIAL REWARD: -96.326, FINAL REWARD: -49.14, #STEPS: 1.
EPISODE: 100, INITIAL REWARD: -116.591, FINAL REWARD: -38.265, #STEPS: 1.
EPISODE: 150, INITIAL REWARD: -323.567, FINAL REWARD: -75.531, #STEPS: 1.
EPISODE: 200, INITIAL REWARD: -27.811, FINAL REWARD: -60.445, #STEPS: 1.
EPISODE: 250, INITIAL REWARD: -48.02, FINAL REWARD: -57.575, #STEPS: 1.
In [26]:
plot_training_log(env, agent, training_log)
In [27]:
# In case you face any issues with the training of the agent,
# avoid executing the previous cell and use instead the 
# following line to load pre-trained weights.

# agent.load_actor_critic_weights('saved_agents/ddpg_ex7')

7 d) Check a few trajectories before and after correction now using the trained agent (run the cell multiple times). How does the agent perform now?

Note that we are using the DDPG algorithm here which is one of the most basic actor-critic RL algorithms. This is also the reason why the trajectory correction will not always be perfect. There are much improved versions that would train in a shorter time and with better performance (e.g. Twin Delayed DDPG aka TD3).

In [28]:
# Exercise 7 d)
run_correction(env, agent)

Summary

  • Reinforcement learning (RL) is concerned with solving decision-making problems and optimizing for best behavior in an environment
  • Different algorithms exist with Q-learning being one of the fundamental ones
  • Q-learning uses a state-action value function $Q(s,a)$ that estimates the expected return
  • $Q(s, a)$ is iteratively learned following the temporal difference rule
  • Once it has converged, we can read off the optimal policy by acting greedily with respect to $Q$
  • Q-learning
    • Lookup table: only works for discrete state-action spaces
    • Q-net (neural network): can deal with continuous states
  • Actor-critic methods are built on top of Q-learning and use two networks:
    • One for the policy (actor) and one to estimate the Q-values (critic)
    • They can solve tasks with continuous states and actions.

Comprehension questions I

  • What do Q-values represent?
  • Why does the Q-table only work for discrete state space?
  • Why does Q-learning only work for discrete action spaces, both for the Q-table and the Q-net?
  • How can you obtain the optimal policy once you know the Q-values?

Comprehension questions II

  • Do you have to increase or decrease the discount factor $\gamma$ to put more emphasis on future rewards?
  • What is the Markov property? Is it fulfilled for the maze environment? Why, or why not? (hint: think about whether it matters for the future evolution how you ended up on the current field of the maze)
  • For the temporal difference learning update, how many different states are involved?

Literature

  • R.S. Sutton and A.G. Barto, "Reinforcement learning - an introduction", Book, 2nd edition, 2020.
  • S. Levine, Deep Reinforcement Learning, Lecture, UC Berkeley, 2022.
  • D. Silver, Reinforcement learning, Lecture, University College London (UCL), 2015.

Python RL libraries

  • Stable baselines 3: github, docs
  • Gymnasium: github, docs