Reinforcement Sort

You know Quicksort and Merge sort and Timsort and Shell sort
Heapsort and Bubble sort and Insertion sort and Selection sort
But do you recall the most god-awful sort of all?

A Reinforcement Sort (a sort done through reinforcement(ish) learning) is the Nickelback of sorting algorithms - a perfect superposition of horrible and awesome. This rockstar was developed using an epsilon greedy Q-learning approach with the aim of generating the best python sorting script. This mainly involved combining code fragments together and giving rewards based on how well these scripts sort a list. The exact setup is summarized as follows:

1) : A binary array chad-kroegered from a md5 hash of the python code string

def oh_god_why(code):
    This converts the code string (state) to a binary array (for the neural net)

    state_hash = hashlib.md5(code)
    state_int = int(state_hash.hexdigest(), 16)
    state_bin = bin(state_int)
    state_str = str(state_bin)[2:].zfill(ARRAY_INPUT_SIZE)
    state_array = np.array(list(state_str))

    return state_array.reshape(1, ARRAY_INPUT_SIZE)

2) : I figured sorting will require some looping, swapping and probably a conditional or two, so I made these actions on a state

from itertools import product
import hashlib
import numpy as np


class Action(object):
    def __init__(self, code):
        self.code = code
        self.start_list = 'unsorted_list'

    def _lookup_element(self, var):
        return "{list}[{var}]".format(**{
            'list': self.start_list,
            'var': var

    def loop_through_elements(self, var, _):
        self.code += "for {var}, _ in enumerate({list}):\n\t".format(**{
            'list': self.start_list,
            'var': var,

    def swap(self, var1, var2):
        self.code += "{ele1}, {ele2} = {ele2}, {ele1};".format(**{
            'ele1': self._lookup_element(var1),
            'ele2': self._lookup_element(var2)

    def compare_elements(self, var1, var2):
        self.code += "\tif( {ele1} < {ele2} ):".format(**{
            'ele1': self._lookup_element(var1),
            'ele2': self._lookup_element(var2)

def generate_actions():
    Gives all the unhidden methods (possible actions) from the Action class
    actions = Action.__dict__.keys()
    actions = filter(lambda method: not method.startswith('_'), actions)
    vars = ['i', 'j']
    possible_actions = [e for e in product(actions, vars, vars) if e[1] != e[2]]

    return possible_actions

def performAction(state, action):
    Takes the current state (the code string) and applies an action
    a = Action(state)
    getattr(a, action[0])(*action[1:])
    return a.code

3) : The state receives a score of -1 if the python code doesn't run, -10 is the code is over 300 characters or +1000 if the list is correctly sorted.

import random


def sort(unsorted_list):
    return unsorted_list
sorted_list = sort({1})

def append_definition(code, start_list):
    This takes the above TEMPLATE string and wraps it around the code string
    return TEMPLATE.format(code, start_list)

def calc_reward(code):
    Rewards based on the current code state:
        1) Code correctly sorts the list: 1000
        2) Code runs: 0
        3) Code doesn't run: -1
        4) Code is over 300 characters: -10
    if len(code) > 300:
        return -10

        exec append_definition(code, START_LIST)

        score = 0
        if sorted_list == range(LIST_LENGTH):
            score = 1000

        score = -1

    return score

After up a neural net, I received this optimal sort:

import random
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import RMSprop
from action import performAction, oh_god_why, generate_actions, ARRAY_INPUT_SIZE
from reward import calc_reward

EPOCHS = 20000
GAMMA = 0.9

def setup_network(num_actions):
    This defines a neural net since that's the cool thing to do nowadays
    model = Sequential()
    model.add(Dense(200, input_shape=(ARRAY_INPUT_SIZE,)))



    rms = RMSprop()
    model.compile(loss='mse', optimizer=rms)

    return model

def learn_best_sort(actions):
    Reinforcement learning via an epsilon greedy strategy
    model = setup_network(len(actions))
    epsilon = 1

    for epoch in range(EPOCHS):
        if epoch % 250 == 0:
            print 'Epoch = ', epoch

        state = ''
        while (True):
            q = model.predict(oh_god_why(state))

            if (random.random() < epsilon):
                action = np.random.randint(0, len(actions))
                action = (np.argmax(q))

            updated_state = performAction(state, actions[action])
            reward = calc_reward(updated_state)

            new_q = model.predict(oh_god_why(updated_state))
            q_max = np.max(new_q)

            if reward == -1:
                update = reward + (GAMMA * q_max)
                update = reward

            q[0][action] = update
  , q, verbose=0, nb_epoch=1)
            state = updated_state

            if reward != -1:

        if epoch < EPOCHS - 1:
            epsilon -= 1.0 / EPOCHS
            print state

if __name__ == '__main__':
    actions = generate_actions()


Now you may be telling yourself that this looks exactly like a Bubble Sort, but I assure you it's not. A Bubble Sort has a reasonable time complexity of O(n^2), but a Reinforcement Sort runs in O(oh god, why am I doing this).