Reinforcement Sort - randomlyunique

Sep 15, 2016

You know Quicksort and Merge sort and Timsort and Shell sort
Heapsort and Bubble sort and Insertion sort and Selection sort
But do you recall the most god-awful sort of all?

A Reinforcement Sort (a sort done through reinforcement(ish) learning) is the Nickelback of sorting algorithms - a perfect horrible and awesome superposition. This rockstar was developed using an epsilon greedy Q-learning approach with the aim of generating the best python sorting script. This mainly involved combining code fragments together and giving rewards based on how well these scripts sort a list. The exact setup is summarized as follows:

1) : A binary array chad-kroegered from a md5 hash of the python code string

import hashlib
import numpy as np

ARRAY_INPUT_SIZE = 128


def oh_god_why(code):
    '''
    This converts the code string (state) to a binary array (for the neural net)
    '''

    state_hash = hashlib.md5(code)
    state_int = int(state_hash.hexdigest(), 16)
    state_bin = bin(state_int)
    state_str = str(state_bin)[2:].zfill(ARRAY_INPUT_SIZE)
    state_array = np.array(list(state_str))

    return state_array.reshape(1, ARRAY_INPUT_SIZE)

2) : I figured sorting will require some looping, swapping and probably a conditional or two, so I made these actions on a state

from itertools import product

class Action(object):
    def __init__(self, code):
        self.code = code
        self.start_list = 'unsorted_list'

    def _lookup_element(self, var):
        return "{list}[{var}]".format(**{
            'list': self.start_list,
            'var': var
        })

    def loop_through_elements(self, var, _):
        self.code += "for {var}, _ in enumerate({list}):\n\t".format(**{
            'list': self.start_list,
            'var': var,
        })

    def swap(self, var1, var2):
        self.code += "{ele1}, {ele2} = {ele2}, {ele1};".format(**{
            'ele1': self._lookup_element(var1),
            'ele2': self._lookup_element(var2)
        })

    def compare_elements(self, var1, var2):
        self.code += "\tif( {ele1} < {ele2} ):".format(**{
            'ele1': self._lookup_element(var1),
            'ele2': self._lookup_element(var2)
        })


def generate_actions():
    '''
    Gives all the unhidden methods (possible actions) from the Action class
    '''
    actions = Action.__dict__.keys()
    actions = filter(lambda method: not method.startswith('_'), actions)
    vars = ['i', 'j']
    possible_actions = [e for e in product(actions, vars, vars) if e[1] != e[2]]

    return possible_actions

def performAction(state, action):
    '''
    Takes the current state (the code string) and applies an action
    '''
    a = Action(state)
    getattr(a, action[0])(*action[1:])
    return a.code

3) : The state receives a score of -1 if the python code doesn't run, -10 is the code is over 300 characters or +1000 if the list is correctly sorted.

import random

LIST_LENGTH = 1000
START_LIST = range(LIST_LENGTH)
random.shuffle(START_LIST)

TEMPLATE = '''
def sort(unsorted_list):
    {0}
    return unsorted_list
sorted_list = sort({1})
'''


def append_definition(code, start_list):
    """
    This takes the above TEMPLATE string and wraps it around the code string
    """
    return TEMPLATE.format(code, start_list)


def calc_reward(code):
    """
    Rewards based on the current code state:
        1) Code correctly sorts the list: 1000
        2) Code runs: 0
        3) Code doesn't run: -1
        4) Code is over 300 characters: -10
    """
    if len(code) > 300:
        return -10

    try:
        exec append_definition(code, START_LIST)

        score = 0
        if sorted_list == range(LIST_LENGTH):
            score = 1000

    except:
        score = -1

    return score

After up a neural net, I received this optimal sort:

import random
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import RMSprop
from action import performAction, oh_god_why, generate_actions, ARRAY_INPUT_SIZE
from reward import calc_reward

EPOCHS = 20000
GAMMA = 0.9


def setup_network(num_actions):
    '''
    This defines a neural net since that's the cool thing to do nowadays
    '''
    model = Sequential()
    model.add(Dense(200, input_shape=(ARRAY_INPUT_SIZE,)))
    model.add(Activation('relu'))

    model.add(Dense(200))
    model.add(Activation('relu'))

    model.add(Dense(num_actions))
    model.add(Activation('linear'))

    rms = RMSprop()
    model.compile(loss='mse', optimizer=rms)

    return model


def learn_best_sort(actions):
    '''
    Reinforcement learning via an epsilon greedy strategy
    '''
    model = setup_network(len(actions))
    epsilon = 1

    for epoch in range(EPOCHS):
        if epoch % 250 == 0:
            print 'Epoch = ', epoch

        state = ''
        while (True):
            q = model.predict(oh_god_why(state))

            if (random.random() < epsilon):
                action = np.random.randint(0, len(actions))
            else:
                action = (np.argmax(q))

            updated_state = performAction(state, actions[action])
            reward = calc_reward(updated_state)

            new_q = model.predict(oh_god_why(updated_state))
            q_max = np.max(new_q)

            if reward == -1:
                update = reward + (GAMMA * q_max)
            else:
                update = reward

            q[0][action] = update
            model.fit(oh_god_why(state), q, verbose=0, nb_epoch=1)
            state = updated_state

            if reward != -1:
                break

        if epoch < EPOCHS - 1:
            epsilon -= 1.0 / EPOCHS
        else:
            print state


if __name__ == '__main__':
    actions = generate_actions()
    learn_best_sort(actions)

Now you may be telling yourself that this looks exactly like a Bubble Sort, but I assure you it's not. A Bubble Sort has a reasonable time complexity of O(n^2), but a Reinforcement Sort runs in O(oh god, why am I doing this).