This notebook demonstrates how grammar-guided genetic programming (G3P) can be used to solve the Pendulum-v0 problem from OpenAI Gym. This is achieved by searching for a small program that defines an agent, which uses an algebraic expression of the observed variables to decide which action to take at each time step.
import time
import warnings
import alogos as al
import gym
import numpy as np
import unified_map as um
warnings.filterwarnings('ignore')
Pendulum-v0: The aim is to swing up a frictionless pendulum and keep it standing upright, starting from a random position and velocity. The agent observes the current position and velocity of the pendulum. It can act by applying a limited torque to the joint (a continuous value between -2 and +2).
# Environment instance shared by every simulation in this notebook.
env = gym.make('Pendulum-v0')
The following simulation functions allow an agent to act in an environment and collect rewards until the environment signals that it is done.
def simulate_single_run(env, agent, render=False):
    """Play one episode and return the summed reward.

    The agent is asked for an action via ``agent.decide(observation)`` at
    every step until ``env.step`` reports that the episode is done. With
    ``render=True`` each step pauses for 0.03 seconds and then draws the
    environment. ``env.close()`` is called once the episode has ended.
    """
    obs = env.reset()
    total = 0.0
    finished = False
    while not finished:
        obs, reward, finished, _ = env.step(agent.decide(obs))
        total += reward
        if render:
            # Slow the loop down so the rendering can be followed by eye.
            time.sleep(0.03)
            env.render()
    env.close()
    return total
def simulate_multiple_runs(env, agent, n):
    """Return the mean episode reward of ``agent`` over ``n`` episodes.

    Each episode is played with ``simulate_single_run`` using its default
    (non-rendering) mode.
    """
    episode_rewards = [simulate_single_run(env, agent) for _ in range(n)]
    return sum(episode_rewards) / n
# Number of episodes averaged when scoring each of the example agents below.
num_sim = 500
class Agent:
    """Hand-written controller that always applies a torque of +2 or -2."""

    def decide(self, observation):
        """Return a one-element NumPy array holding the chosen torque.

        Observations with negative ``y`` are mirrored (``y`` and the angular
        velocity are negated) so the decision rule only has to handle
        ``y >= 0``; the resulting torque is negated again for those cases.
        """
        x, y, angle_velocity = observation

        mirrored = (y < 0.)
        if mirrored:
            y *= -1.  # now y >= 0
            angle_velocity *= -1.

        # Recover the angle from (x, y); arcsin alone only covers x >= 0.
        angle = np.arcsin(y)
        if x < 0.:
            angle = np.pi - angle

        # Velocity-dependent bounds of the band in which +2 is applied.
        band_lower = 0.03 * (angle_velocity - 2.5) ** 2. + 1.
        band_upper = 0.15 * (angle_velocity + 3.) ** 2. + 2.
        below_line = angle < -0.3 * angle_velocity
        inside_band = angle > band_lower and angle < band_upper

        force = 2. if (below_line or inside_band) else -2.
        if mirrored:
            force *= -1.
        return np.array([force,])
# Instantiate the Agent class defined directly above and compute its mean
# reward over num_sim episodes.
agent = Agent()
simulate_multiple_runs(env, agent, num_sim)
# Agent in the shape described by the grammar defined further below: the
# action is a fixed algebraic expression of the observation, clamped to
# [-2.0, 2.0]. The expression is kept verbatim.
class Agent:
    def decide(self, observation):
        x, y, angle_velocity = observation
        # NOTE(review): the expression divides by x; what happens at x == 0
        # (exception vs. inf/nan) depends on the observation's numeric type
        # -- confirm.
        output = (6.46/((4.45**(5.67/8.42))/((((y-y)*1.50)-(((x/x)/x)*angle_velocity))-((5.40*x)*y))))
        # Clamp the raw expression value and wrap it in a one-element list.
        action = [min(max(output, -2.0), 2.0)]
        return action
# Rebind `agent` to this class and compute its mean reward over num_sim episodes.
agent = Agent()
simulate_multiple_runs(env, agent, num_sim)
# Agent in the shape described by the grammar defined further below: the
# action is a fixed algebraic expression of the observation, clamped to
# [-2.0, 2.0]. The expression is kept verbatim.
class Agent:
    def decide(self, observation):
        x, y, angle_velocity = observation
        # The expression uses all three observed values and several
        # constant-only sub-expressions such as (2.29-4.83) and (9.86/0.28).
        output = ((x/(((2.29-4.83)+y)/(angle_velocity+(8.50*(9.86/0.28)))))*(y+angle_velocity))
        # Clamp the raw expression value and wrap it in a one-element list.
        action = [min(max(output, -2.0), 2.0)]
        return action
# Rebind `agent` to this class and compute its mean reward over num_sim episodes.
agent = Agent()
simulate_multiple_runs(env, agent, num_sim)
# Agent in the shape described by the grammar defined further below: the
# action is a fixed algebraic expression of the observation, clamped to
# [-2.0, 2.0]. The expression is kept verbatim.
class Agent:
    def decide(self, observation):
        x, y, angle_velocity = observation
        # NOTE(review): the outermost operation divides by x; what happens at
        # x == 0 (exception vs. inf/nan) depends on the observation's numeric
        # type -- confirm.
        output = (((((7.05/(x+(6.66/1.04)))-angle_velocity)-((y+y)+x))*3.04)/x)
        # Clamp the raw expression value and wrap it in a one-element list.
        action = [min(max(output, -2.0), 2.0)]
        return action
# Rebind `agent` to this class and compute its mean reward over num_sim episodes.
agent = Agent()
simulate_multiple_runs(env, agent, num_sim)
# Agent in the shape described by the grammar defined further below: the
# action is a fixed algebraic expression of the observation, clamped to
# [-2.0, 2.0]. The expression is kept verbatim.
class Agent:
    def decide(self, observation):
        x, y, angle_velocity = observation
        # NOTE(review): the expression divides by y in two places; what
        # happens at y == 0 (exception vs. inf/nan) depends on the
        # observation's numeric type -- confirm.
        output = ((((2.05*x)-x)*((x-6.40)-(angle_velocity/y)))/y)
        # Clamp the raw expression value and wrap it in a one-element list.
        action = [min(max(output, -2.0), 2.0)]
        return action
# Rebind `agent` to this class and compute its mean reward over num_sim episodes.
agent = Agent()
simulate_multiple_runs(env, agent, num_sim)
# EBNF grammar whose language is a family of small Python programs. Every
# program defines a class ``Agent`` with a ``decide`` method; only the
# arithmetic expression assigned to ``output`` (nonterminal EXPR) varies.
#
# The terminals L1-L5 carry the Python indentation of the generated program:
# four spaces for the method header and eight for its body. With the same
# indent on the header and its body, the generated text is not valid Python
# and cannot be executed.
#
# EXPR is either an observed variable, a constant of the form d.dd, or a
# parenthesised binary operation on two sub-expressions.
ebnf_text = """
program = L0 NL L1 NL L2 NL L3 NL L4 NL L5
L0 = "class Agent:"
L1 = "    def decide(self, observation):"
L2 = "        x, y, angle_velocity = observation"
L3 = "        output = " EXPR
L4 = "        action = [min(max(output, -2.0), 2.0)]"
L5 = "        return action"
NL = "\n"
EXPR = VAR | CONST | "(" EXPR OP EXPR ")"
VAR = "x" | "y" | "angle_velocity"
CONST = DIGIT "." DIGIT DIGIT
OP = "+" | "-" | "*" | "/" | "**"
DIGIT = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
"""
# Build the alogos grammar object from the EBNF text held in ebnf_text.
grammar = al.Grammar(ebnf_text=ebnf_text)
The objective function gets a candidate solution (= a string of the grammar's language) and returns a fitness value for it. This is done by 1) executing the string as a Python program, so that it creates an agent object, and then 2) using the agent in multiple simulations to see how well it can handle different situations: the higher the average reward, the better the candidate.
def string_to_agent(string):
    """Execute ``string`` as Python source and return an instance of its ``Agent``.

    The source is run with a fresh dictionary as its local namespace, from
    which the class named ``Agent`` is then looked up and instantiated.
    A ``KeyError`` results if the source does not define that name.
    """
    # NOTE(review): exec runs arbitrary code; only pass strings from a
    # trusted source such as the grammar in this notebook.
    namespace = {}
    exec(string, None, namespace)
    return namespace['Agent']()
def objective_function(string):
    """Return the fitness of a candidate program given as source text.

    The text is turned into an agent with ``string_to_agent`` and scored by
    its mean reward over 15 episodes in the global ``env``.
    """
    candidate = string_to_agent(string)
    return simulate_multiple_runs(env, candidate, 15)
Check if grammar and objective function work as intended.
# Generate one string from the grammar, show it, and evaluate it with the
# objective function as an end-to-end check of both pieces.
random_string = grammar.generate_string()
print(random_string)
objective_function(random_string)
# Configure the search: maximize objective_function over the grammar's
# language with 50 individuals per population and 50 offspring.
# NOTE(review): max_or_min_fitness=-180 is presumably a fitness threshold at
# which the run stops, and the unified_map evaluator presumably evaluates
# candidates in parallel -- confirm both against the library documentation.
ea = al.EvolutionaryAlgorithm(
    grammar, objective_function, 'max', max_or_min_fitness=-180,
    population_size=50, offspring_size=50, evaluator=um.univariate.parallel.futures, verbose=True)
# Run the search and keep the individual it returns.
best_ind = ea.run()
# The phenotype is the program text; show it and turn it into an agent.
string = best_ind.phenotype
print(string)
agent = string_to_agent(string)
# Re-score the found agent over 100 episodes, then play one rendered episode.
simulate_multiple_runs(env, agent, 100)
simulate_single_run(env, agent, render=True)