flappy-es/es.py
2018-12-07 10:48:13 +03:30

74 lines
2.6 KiB
Python

import numpy as np
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of *x*, element-wise.

    Args:
        x: A scalar or NumPy array of real values.

    Returns:
        The sigmoid of ``x`` with the same shape as ``x``; values lie in [0, 1].
    """
    # log(1 + exp(-x)) is evaluated via logaddexp(0, -x), which never forms
    # exp() of a large positive number. The naive formula overflows (and emits
    # a RuntimeWarning) once -x exceeds ~709 for float64 inputs.
    return np.exp(-np.logaddexp(0, -x))
class EvolutionStrategy():
    """Train a small sigmoid feed-forward network with an evolution strategy.

    Every training step perturbs the weights with Gaussian noise ``noisep``
    times, scores each perturbed network through ``fn``, and moves the weights
    toward the perturbations that earned above-average reward.

    Args:
        fn: Function that plays the game and returns the reward. It must take
            as its only argument another function that decides whether the
            bird should jump, e.g. ``def fn(should_jump): ... return reward``.
        noisep: Noise population, i.e. how many different noises are tried at
            each training step.
        sigma: Standard deviation of the generated noise.
        alpha: Learning rate.
        layer_sizes: Shapes of the weight matrices, one per layer,
            e.g. ``[[4, 500], [500, 1]]``.
        input_size: Number of inputs. Stored on the instance; this class does
            not read it itself.
    """

    def __init__(self, fn, noisep, sigma, alpha, layer_sizes, input_size):
        self.fn = fn
        self.sigma = sigma
        self.noisep = noisep
        self.alpha = alpha
        self.layer_sizes = layer_sizes
        self.input_size = input_size
        # Initialize every weight matrix uniformly at random in [-0.1, 0.1).
        self.layers = [
            np.random.uniform(-0.1, 0.1, tuple(shape)) for shape in layer_sizes
        ]

    @staticmethod
    def _propagate(layers, values):
        """Feed ``values`` through ``layers``, applying sigmoid(xW) per layer."""
        for weights in layers:
            values = sigmoid(np.dot(values, weights))
        return values

    def _policy_for(self, layers):
        """Return a callable that runs forward propagation on fixed ``layers``."""
        # The parameter is named ``input`` to mirror ``forward`` so callers
        # that pass it by keyword keep working.
        def policy(input):
            return self._propagate(layers, input)
        return policy

    def forward(self, input):
        """Forward propagation: sigmoid(xW) for every layer of this network.

        Args:
            input: Input vector (or batch) compatible with the first layer.

        Returns:
            The output of the last layer.
        """
        return self._propagate(self.layers, input)

    def train(self):
        """Run one evolution-strategy step and update ``self.layers`` in place.

        Returns:
            A NumPy array of length ``noisep`` holding the raw reward obtained
            by each perturbed network. If the rewards have zero or non-finite
            spread there is no usable search direction, so the weights are
            left unchanged.
        """
        # noises[j][i] is the noise applied to layer j for population member i.
        noises = [[] for _ in self.layers]
        rewards = np.zeros(self.noisep)

        for member in range(self.noisep):
            noisy_layers = []
            for layer_noises, layer in zip(noises, self.layers):
                # For each layer, generate a noise of the layer's own shape.
                noise = np.random.randn(*np.shape(layer))
                layer_noises.append(noise)
                noisy_layers.append(layer + self.sigma * noise)
            # Score the perturbed weights. A lightweight closure is enough;
            # building a whole new strategy object would only re-sample
            # random weights that are thrown away immediately.
            rewards[member] = self.fn(self._policy_for(noisy_layers))

        spread = np.std(rewards)
        if not (np.isfinite(spread) and spread > 0):
            # All rewards identical (or invalid): normalizing would divide by
            # zero and overwrite every weight with NaN, so skip the update.
            return rewards

        # Normalize the rewards.
        advantages = (rewards - np.mean(rewards)) / spread
        step = self.alpha / (self.noisep * self.sigma)
        for index, layer_noises in enumerate(noises):
            stacked = np.array(layer_noises)  # shape: (noisep, *layer_shape)
            # Weight each noise sample by its normalized reward and sum over
            # the population axis.
            direction = np.tensordot(advantages, stacked, axes=1)
            self.layers[index] = self.layers[index] + step * direction
        return rewards