Coding Demo of how to optimize a mobile robot's path in a warehouse environment with reinforcement learning¶
The Goal¶
Let's consider a mobile robot in a warehouse that is used to retrieve items for packing. We want to give the robot the location of the desired item(s) and its starting position, and the robot should figure out the shortest path to go and retrieve the objects. The robot must avoid obstacles along the way; running into one earns a large negative reward. To incentivize the robot to move to the desired location(s) as quickly as possible, there is also a slight negative reward for every step it takes.
As we move through this demo, it is important to remember the overall Reinforcement Learning process we use to train this robot. The diagram below shows the overall process for how Reinforcement Learning works.

Note--this type of path planning can also be used in other ways. For instance, in the presentation, we also talk about how this same method could be used to optimize the path a package takes through a warehouse to the truck that will ship it.
Why would you use Reinforcement Learning?¶
Here are some of the reasons:
The robot does not need to be explicitly told what paths are best.
The Q-Learning Algorithm can be re-trained relatively easily with a new environment configuration (different obstacles, etc.).
Reinforcement Learning can continually optimize as time goes on.
Businesses would not necessarily need to rely on complicated and cumbersome automation strategies to do the same task.
Here are some of the downsides:
- By its nature, Reinforcement Learning is not guaranteed to find the optimal solution every time. However, it is likely to at least get close, and it is far faster and simpler than solving a full optimization problem.
Import Libraries and Set Seed¶
# Importing Numpy Library
import numpy as np
# Turn legacy formatting ON
np.set_printoptions(legacy='1.13')
# Setting seed for reproducibility
np.random.seed(1693)
Understanding the Environment -- Warehouse Layout Visual¶

Image Source: Author
The environment consists of 100 squares arranged in 10 rows and 10 columns.
The indices of the rows and columns start at 0—so they are indexed as 0 through 9 (as shown on the top and left of the image). We use the indices when going through the loops to determine our states later.
For user ease, every cell in the environment has a code name shown in its top right corner (examples: A1, J10, etc.).
There are two robots in the diagram, and for this demo, the “user” would just choose which one to use based on its starting location.
The cells highlighted in red are obstacles that the robot cannot traverse and are given a penalty of -20, as shown at the bottom of the image.
In the environment, one or more cells may be highlighted in green, representing the goal(s) the robot is trying to reach. When the robot reaches the goal, it is given a reward of +100, as shown at the bottom of the image.
Finally, the black (empty) cells are the cells the robot is allowed to move through as part of its path. They have values of -1 to motivate the robot to hurry (and generate the shortest path).
Initial Setup and Defining Functions Part 1¶
# This represents the environment rewards in an array
rewardList = [
[-1., -1., -1., -1., -1., -1., -1., -1., -1., -1.],
[-1., -20., -20., -1., -1., -1., -1., -1., -20., -1.],
[-1., -20., -1., -1., -1., -1., -20., -1., -1., -1.],
[-1., -1., -1., -1., -1., -1., -20., -1., -20., -1.],
[-1., -1., -1., -20., -1., -1., -20., -1., -20., -1.],
[-1., -1., -20., -20., -1., -1., -1., -1., -20., -1.],
[-1., -1., -1., -20., -1., -1., -1., -1., -1., -20.],
[-1., -1., -1., -1., -1., -20., -1., -1., -1., -1.],
[-1., -1., -20., -20., -1., -20., -1., -20., -20., -1.],
[-1., -1., -1., -1., -1., -20., -1., -20., -20., -1.]
]
map_rewards=np.array(rewardList)
Note -- When we set up the initial array map_rewards, we do not include the goal location(s) so we can train our model to go anywhere in the environment.
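As an optional sanity check (not in the original notebook), we can confirm the shape of the array we just built and how many obstacle cells it contains:
# Optional sanity check on the rewards array defined above
print(map_rewards.shape)                 # (10, 10)
print(int((map_rewards == -20.).sum()))  # 24 obstacle cells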
# Create variables needed later
actions = ['up', 'right', 'down', 'left'] # Possible actions the robot can take
WHrows = 10 # warehouse rows
WHcols = 10 # warehouse columns
# Creates a dictionary with a code name for every pair of indices for every grid cell
# The indices are given in tuples as the key and the codename as the value
keys = []
for i in range(0,10):
for j in range(0,10):
keys.append((i,j))
values = []
for v in ["A","B","C","D","E","F","G","H","I","J"]:
for k in range(1,11):
value = v + str(k)
values.append(value)
codeNames = dict(zip(keys, values))
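For example (an optional check, not in the original notebook), the corner cells map as you would expect from the image:
# Optional check of the codename mapping for a few corner cells
print(codeNames[(0, 0)], codeNames[(0, 9)], codeNames[(9, 9)])  # A1 A10 J10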
# This function finds a random legal row and column to start on while training
# Important to have a variety of starting places so we do not only train our model
# to start from one place -- we want to be able to start from anywhere
def start_pos():
"""
Select starting legal position for each episode randomly
Args:
returns:
row(int): index of row of start position of robot
col(int): index of column of start position of robot
"""
row = np.random.randint(WHrows)
col = np.random.randint(WHcols)
while map_rewards[row, col] != -1.:
row = np.random.randint(WHrows)
col = np.random.randint(WHcols)
return row, col
# This function takes the row/col of either the start_pos() or current state
# Then generates the next action using the Epsilon Greedy Policy
# The Epsilon Greedy Policy deals with Exploitation vs. Exploration
# If the random number is greater than epsilon, we exploit:
# the Agent takes the action with the highest Q-value for the current state.
def action_next(row, col, exploration_rate):
"""
Select next action based on Epsilon Greedy Policy
Args:
row(int): index of row
col(int): index of column
exploration_rate: The value of epsilon
returns:
Action: The index of next action (ex: 'up' would be 0)
"""
if np.random.random() > exploration_rate: #Epsilon
return np.argmax(table_Q[row, col])
else: #choose a random action
return np.random.randint(4)
# This function gets the new state after the action from action_next() was generated
# Remember--the state means which cell the robot is in
def location_next(row, col, action_index):
"""
Computes the indices of new row and column after taking the action
Args:
row(int): index of row
col(int): index of column
action_index(int): index of action
returns:
new_row(int): index of new row after taking the action
new_column(int): index of new column after taking the action
"""
new_row = row
new_column = col
if actions[action_index] == 'up' and row > 0:
new_row -= 1
elif actions[action_index] == 'right' and col < WHcols - 1:
new_column += 1
elif actions[action_index] == 'down' and row < WHrows - 1:
new_row += 1
elif actions[action_index] == 'left' and col > 0:
new_column -= 1
return new_row, new_column
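A few quick spot checks (not part of the original notebook) show how the boundary handling works: an illegal move off the grid simply leaves the robot where it is. The action indices assume the actions list order defined above.
# Spot checks for location_next() -- indices follow actions = ['up', 'right', 'down', 'left']
print(location_next(3, 3, 1))  # 'right' from row 3, col 3 -> (3, 4)
print(location_next(0, 5, 0))  # 'up' from the top row stays put -> (0, 5)
print(location_next(9, 0, 3))  # 'left' from the first column stays put -> (9, 0)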
Deeper Dive Explanations¶
This section will take a closer look at what is happening in the next two functions. The concepts below will appear later in the demo as well.
Part 1 - Epsilon Decay¶
As we have previously seen, when making a Reinforcement Learning model, we can use a constant epsilon for the Epsilon-Greedy Strategy. However, it may be beneficial to use a decaying epsilon strategy, meaning training starts at 100% exploration and moves toward (nearly) 100% exploitation. A decaying epsilon strategy has several advantages over a constant one, the main one being that it can help the Agent converge faster to the optimal policy. However, if the decay rate is too high, the Agent may not explore enough of the environment and can get stuck, so in this example we keep it at either 0.001 or 0.01.
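As a rough illustration (using the same decay formula that appears in the training function below, with the minimum and maximum rates assumed to be 0.01 and 1, matching the values used later), we can print epsilon at a few episode numbers to see the schedule:
# Sketch of the decaying epsilon schedule (values assumed for illustration)
min_eps, max_eps, decay = 0.01, 1.0, 0.001
for ep in [0, 500, 1000, 2500, 5000]:
    eps = min_eps + (max_eps - min_eps) * np.exp(-decay * ep)
    print(ep, round(eps, 3))  # falls from 1.0 toward 0.01 as episodes increase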
Part 2 - Updating Q-Table¶
A Q-Table is just a simple lookup table in which we store the maximum expected future reward for each action at each state. The 'Q' stands for 'Quality'. Below is a simple illustration showing the process of creating and updating a Q-Table.

Once the Q-Table is initialized, the Q-function (seen below) is used to update it. The Q-function uses the Bellman equation and takes two inputs: state (S) and action (A). Using the equation, we can compute our Q-value for our state-action pairs. Also, note that alpha and gamma appear in this equation.
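Written out, this update (as implemented in the temporal-difference step of the training code below) is:
$$Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \left[ R_{t+1} + \gamma \max_{a} Q(S_{t+1}, a) - Q(S_t, A_t) \right]$$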

Part 3 - Alpha and Gamma¶
Alpha -
- The Alpha value is the learning rate for the model, and it should be somewhere in the range of 0-1. When updating the Q-Table, the higher the learning rate, the more quickly the new Q-value replaces the old one (i.e., the more quickly the model "learns"). In other words, the learning rate controls how much of our previous estimate we keep for each state-action pair versus how much we overwrite it with new experience. This is a hyperparameter we can tune, and we will do so later in this demo.
Gamma -
- The Gamma value is the discount rate, which controls how much the Agent values future rewards. This value should also be somewhere in the range of 0-1, and it is also a tunable hyperparameter. If the value is 1, the Agent values future rewards just as much as the immediate reward, so very high values of gamma might not be best for learning. Conversely, a gamma of zero causes the Agent to value only immediate rewards, which only works well in certain cases. We will explore what gamma value works best for this model later in this demo.
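To make this concrete, here is a small, hypothetical worked example of a single Q-value update (all numbers are made up for illustration):
# Hypothetical single Q-update showing how alpha and gamma enter the calculation
q_old = 10.0        # current Q-value for the state-action pair (made up)
reward = -1.0       # step penalty from the environment
best_next_q = 50.0  # best Q-value available in the next state (made up)
alpha_demo, gamma_demo = 0.1, 0.99
td = reward + gamma_demo * best_next_q - q_old  # temporal difference
q_new = q_old + alpha_demo * td                 # blend the old estimate with the new one
print(round(q_new, 2))  # 13.85 -- with alpha_demo = 1.0 it would jump straight to 48.5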
# This function will train our Q-Table (initialized inside the function)
# The Q-Table is initialized inside the function because we want a fresh Q-Table
# every time we run the function
# This function was created so we can tune our hyperparameters more easily
def train(ending_cell_position):
"""
Trains the model Q-Table
Args:
ending_cell_position: Ending cell label (ex: 'A10')
returns:
mean_reward(float): the mean reward of all the episodes
std_reward(float): the standard deviation reward of all the episodes
"""
# This part converts the cell codename given in the argument to the row and column
keys_end = [k for k, v in codeNames.items() if v == ending_cell_position]
end_row_index, end_column_index = keys_end[0]
# This will put the reward value for the goal (100) into the part of the rewards array
# representing the cell given in the argument
global map_rewards_new
map_rewards_new = np.copy(map_rewards)
map_rewards_new[end_row_index, end_column_index] = 100
# This part initializes the Q-Table and makes sure it is accessible to other functions
# See Part 2 above
global table_Q
table_Q = np.zeros((WHrows, WHcols, 4))
# This empty list will hold the total rewards for each episode for the entire training
episode_rewards = []
# This is where the training begins
for episode in range(num_episodes):
# See Part 1 above on Epsilon Decay
exploration_rate = min_exploration_rate + \
(max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate*episode)
# Get an initial random start position for the robot
row_index, column_index = start_pos()
# This will count the total rewards in 1 episode
total_rewards_ep = 0
# This creates a loop that represents every step the robot takes within the episode
# It will end if the robot reaches the goal or runs into an obstacle
while map_rewards_new[row_index, column_index] == -1.:
# Gets next action based on current state
action_index = action_next(row_index, column_index, exploration_rate)
# Gets next state based on action from above
row_old, column_old = row_index, column_index
row_index, column_index = location_next(row_index, column_index, action_index)
# Calculates the reward for the new state
reward = map_rewards_new[row_index, column_index]
# Updating Q-Table (see Part 2 above)
# Also see Part 3 for Alpha and Gamma Explanations
q_old = table_Q[row_old, column_old, action_index]
# Temporal difference (TD) error - the gap between the new estimate
# (reward + discounted best value of the next state) and the old Q-value
TD = reward + (gamma * np.max(table_Q[row_index, column_index])) - q_old
q_new = q_old + (alpha * TD)
table_Q[row_old, column_old, action_index] = q_new
# Add rewards for this step to episode total
total_rewards_ep += reward
# Add total rewards for this episode to list
episode_rewards.append(total_rewards_ep)
# Calculate the mean and std for all episodes rewards for review
mean_reward = np.mean(episode_rewards)
std_reward = np.std(episode_rewards)
return mean_reward, std_reward
# This function tests our Agent with the Q-Table that was trained with the previous function
def evaluate_agent(n_eval_episodes, starting_cell_position):
"""
Tests to see the performance of the Agent with the trained Q-Table
Args:
n_eval_episodes: The number of testing episodes
starting_cell_position: Starting cell label (ex: 'D4')
returns:
mean_reward(float): the mean reward of all the episodes
std_reward(float): the standard deviation reward of all the episodes
"""
# This part converts the cell codename given in the argument to the row and column
keys_start = [k for k, v in codeNames.items() if v == starting_cell_position]
start_row_index, start_column_index = keys_start[0]
# This checks if the starting cell position is valid
if map_rewards_new[start_row_index, start_column_index] != -1.:
print('Invalid starting position')
else:
# This empty list will hold the total rewards for each episode for the entire testing
episode_rewards = []
# This is where the testing begins
for episode in range(n_eval_episodes):
# Get an initial start position for the robot from start position above
row, col = start_row_index, start_column_index
# This will count the total rewards in 1 episode
total_rewards_ep = 0
# This creates a loop that represents every step the robot takes within the episode
# It has max_steps in case you want to limit how many steps per episode
for step in range(max_steps):
# It will end if the robot reaches the goal or runs into an obstacle
while map_rewards_new[row, col] == -1.:
# Gets next action based on current state (see note below about 0 value)
action_index = action_next(row, col, 0.)
# Gets next state based on action from above
row, col = location_next(row, col, action_index)
# Calculates the reward for the new state
reward = map_rewards_new[row, col]
# Add rewards for this step to episode total
total_rewards_ep += reward
# Add total rewards for this episode to list
episode_rewards.append(total_rewards_ep)
# Calculate the mean and std for all episodes rewards for review
mean_reward = np.mean(episode_rewards)
std_reward = np.std(episode_rewards)
return mean_reward, std_reward
Note--Epsilon's value is 0 here because, when testing, we only want to exploit the best values and get the shortest path, minimizing the time spent retrieving the product.
Defining Final Functions Part 2¶
When originally designing how I wanted these functions to work, I prioritized flexibility. I wanted to be able to have the robot start from anywhere and have the goal be anywhere (that is a legal space) in the environment. In order to achieve this, I had to be able to train a new Q-Table and run the trained Q-Table every time I ran the function with new starting and ending points.
While this is great for user ease and makes for a convenient use case, there are some drawbacks. First, it is important to limit how long training takes and how computationally intensive it is while keeping it as accurate as possible. We will address this in the hyperparameter tuning section, but it is good to keep in mind for now.
Another drawback that you will see during the tuning section is consistency. Since the Q-Table has to be re-trained every time the function is run, you may get different values and, therefore, different paths (some shorter than others). In the tuning section, we will explore which tradeoffs we are willing to make to get the results we want and run some tests to see how to tune our hyperparameters for consistency.
# This function converts the states from indices to the codenames
def get_codeNames_path(shortest_path):
"""
Performs the reverse mapping from location indices to code names.
Args:
shortest_path(List): a list of row/column index pairs
returns:
pathList: List of locations in final path in the form of code names
"""
pathList = []
for i in shortest_path:
j=tuple(i)
pathList.append(codeNames[j])
return pathList
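As a quick illustration (not in the original notebook), a short list of row/column pairs converts like this:
# Quick check of the index-to-codename conversion
print(get_codeNames_path([[3, 3], [3, 4], [2, 4]]))  # ['D4', 'D5', 'C5']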
# This function brings everything together that we have learned so far
# Almost all of it we have seen in other functions, so I will not comment every line
def final_path(starting_cell_position, ending_cell_position):
"""
Computes/prints final path of Agent/Robot
Args:
starting_cell_position: Starting cell label (ex: 'D4')
ending_cell_position: Ending cell label (ex: 'A10')
returns:
None: the final path is printed as a sequence of code names rather than returned
"""
keys_start = [k for k, v in codeNames.items() if v == starting_cell_position]
start_row_index, start_column_index = keys_start[0]
keys_end = [k for k, v in codeNames.items() if v == ending_cell_position]
end_row_index, end_column_index = keys_end[0]
global map_rewards_new
map_rewards_new = np.copy(map_rewards)
map_rewards_new[end_row_index, end_column_index] = 100
global table_Q
table_Q = np.zeros((WHrows, WHcols, 4))
for episode in range(num_episodes):
exploration_rate = min_exploration_rate + \
(max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate*episode)
row_index, column_index = start_pos()
while map_rewards_new[row_index, column_index] == -1.:
action_index = action_next(row_index, column_index, exploration_rate)
row_old, column_old = row_index, column_index
row_index, column_index = location_next(row_index, column_index, action_index)
reward = map_rewards_new[row_index, column_index]
q_old = table_Q[row_old, column_old, action_index]
# temporal_difference
TD = reward + (gamma * np.max(table_Q[row_index, column_index])) - q_old
q_new = q_old + (alpha * TD)
table_Q[row_old, column_old, action_index] = q_new
# Creates a list object for the suggested path
pathList=[]
if map_rewards_new[start_row_index, start_column_index] != -1.:
print('Invalid starting position')
else:
row, col = start_row_index, start_column_index
# This holds the row/col list of the suggested path
shortest_path = []
shortest_path.append([row, col])
while map_rewards_new[row, col] == -1.:
action_index = action_next(row, col, 0.)
row, col = location_next(row, col, action_index)
shortest_path.append([row, col])
# This converts the rows/cols into the codenames for user ease
pathList=get_codeNames_path(shortest_path)
# This creates a nice looking print statement
for i in pathList:
if i != pathList[-1]:
print(i, end =" => ")
else:
print(i)
# This function is almost identical to the previous--it only changes the return statement to a list
# We want the return statement to be a list so we can concatenate two later
# Almost all of it we have seen in other functions, so I will not comment every line
def pre_extra_stop(starting_cell_position, ending_cell_position):
"""
Computes/prints final path of Agent/Robot (used within extra stop function)
Args:
starting_cell_position: Starting cell label (ex: 'D4')
ending_cell_position: Ending cell label (ex: 'A10')
returns:
pathList(List): List of locations in final path in the form of code names
"""
keys_start = [k for k, v in codeNames.items() if v == starting_cell_position]
start_row_index, start_column_index = keys_start[0]
keys_end = [k for k, v in codeNames.items() if v == ending_cell_position]
end_row_index, end_column_index = keys_end[0]
global map_rewards_new
map_rewards_new = np.copy(map_rewards)
map_rewards_new[end_row_index, end_column_index] = 100
global table_Q
table_Q = np.zeros((WHrows, WHcols, 4))
for episode in range(num_episodes):
exploration_rate = min_exploration_rate + \
(max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate*episode)
row_index, column_index = start_pos()
while map_rewards_new[row_index, column_index] == -1.:
action_index = action_next(row_index, column_index, exploration_rate)
row_old, column_old = row_index, column_index
row_index, column_index = location_next(row_index, column_index, action_index)
reward = map_rewards_new[row_index, column_index]
q_old = table_Q[row_old, column_old, action_index]
# temporal_difference
TD = reward + (gamma * np.max(table_Q[row_index, column_index])) - q_old
q_new = q_old + (alpha * TD)
table_Q[row_old, column_old, action_index] = q_new
pathList=[]
if map_rewards_new[start_row_index, start_column_index] != -1.:
print('Invalid starting position')
else:
row, col = start_row_index, start_column_index
shortest_path = []
shortest_path.append([row, col])
while map_rewards_new[row, col] == -1.:
action_index = action_next(row, col, 0.)
row, col = location_next(row, col, action_index)
shortest_path.append([row, col])
pathList=get_codeNames_path(shortest_path)
# This is where we just return the list
return pathList
# This function allows for one more argument of a middle stop in the path
# We use the previous function so we can concatenate the two lists together to make
# a final path
def extra_stop(starting_location, intermediary_location, ending_location):
"""
Computes/prints final path of Agent/Robot with extra stop
Args:
starting_location: Starting cell label (ex: 'D4')
intermediary_location: Intermediary cell label (ex: 'D6')
ending_location: Ending cell label (ex: 'A10')
returns:
None: the final path (with the extra stop) is printed as a sequence of code names rather than returned
"""
# Here we run the pre_extra_stop() function twice with the intermediary_location
# in the middle. We also use [1:] so the intermediary_location is not listed twice
final = pre_extra_stop(starting_location, intermediary_location) + pre_extra_stop(intermediary_location, ending_location)[1:]
for i in final:
if i != final[-1]:
print(i, end =" => ")
else:
print(i)
Training the model and Hyperparameter Tuning¶
In this section, we will tune our hyperparameters to see what values work best for this model in training and testing. This is especially important because (as I mentioned earlier) when we run the final function, we will train a new Q-Table and run the newly trained Q-Table every time.
In order to try and fully discover how this affects our model, we will be exploring the hyperparameter tuning in two sections—one that will focus on how each parameter affects the training numbers of the model. The other section will focus on consistency when running the testing functions with the same inputs many times with slightly different Q-Tables each time.
We will pay special attention to the hyperparameters num_episodes, alpha, exploration_decay_rate, and gamma. Please reference the explanations above to understand why we would want to tune alpha, exploration_decay_rate, and gamma. We want to tune num_episodes because it is one of the main levers we control that affects how long it takes to train our model and, therefore, how long our final function takes to run. When we get to the consistency section, we will have to weigh the pros and cons of a quick function versus an accurate one.
Seeing how Hyperparameters affect the model¶
# These are two parameters that could be tuned for the testing
# This represents the number of episodes used for testing (evaluation)
n_eval_episodes = 100
# This represents the max steps the robot can take in 1 episode
# This is not needed as much for this demo, but it could be useful in a larger environment
max_steps = 100
Note--even though we are mostly paying attention to the training values at this point, it is still helpful to see how it performs in testing.
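If you wanted to automate the comparisons that follow, one possible approach (a hypothetical helper that is not part of the original notebook and is not called here, so the recorded results below are unaffected) is a small sweep that reuses train() and evaluate_agent():
# Hypothetical helper sketching a small hyperparameter sweep (defined but not called)
def hyperparameter_sweep():
    # train() and evaluate_agent() read the hyperparameters as globals,
    # so the sweep assigns them at module level
    global num_episodes, alpha, gamma
    global max_exploration_rate, min_exploration_rate, exploration_decay_rate
    gamma = 0.99
    max_exploration_rate, min_exploration_rate = 1, 0.01
    exploration_decay_rate = 0.01
    for num_episodes in [1000, 5000]:
        for alpha in [0.1, 0.99]:
            train_mean, train_std = train('E10')
            test_mean, test_std = evaluate_agent(n_eval_episodes, 'H1')
            print(num_episodes, alpha, round(train_mean, 1), test_mean)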
Test 1 (a)¶
- Initial test (nothing to change)
# These are the main hyperparameters we will be tuning
# In particular, num_episodes, alpha, exploration_decay_rate, and gamma
num_episodes = 5000
alpha = 0.1 # learning_rate
gamma = 0.99 # discount_rate
exploration_rate = 1 # epsilon
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001 # decay rate
train('E10')
(48.941000000000003, 56.095517815597354)
evaluate_agent(n_eval_episodes, 'H1')
(83.0, 0.0)
What did we learn?¶
This is a good baseline to see how the parameters affect the model when changed.
Test 2 (a)¶
- What did we change?
- num_episodes from 5000 to 10000
num_episodes = 10000
alpha = 0.1 # learning_rate
gamma = 0.99 # discount_rate
exploration_rate = 1 # epsilon
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001 # decay rate
train('E10')
(67.131600000000006, 46.710117548985039)
evaluate_agent(n_eval_episodes, 'H1')
(85.0, 0.0)

Note--The 85 we see above is 100 - 15: the +100 goal reward minus the 15 intermediate squares (at -1 each) crossed to get there (the starting square is not counted). As a spot check, this makes sense (see the quick check below). Given that this is the most efficient result (so far), we want to get as close as possible to it when tuning the training--without getting too close, because we want the model to explore enough at the beginning.
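As a tiny check of that arithmetic (a hypothetical snippet, not in the original notebook): a path of 17 cells from H1 to E10 (including the start and goal cells) means 16 moves, 15 of which land on -1 cells before the final move lands on the +100 goal.
# Spot-check the reward arithmetic for a 17-cell path (start + 15 steps + goal)
path_cells = 17
expected_reward = 100 - (path_cells - 2)
print(expected_reward)  # 85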
What did we learn?¶
Increasing the num_episodes definitely increases the overall accuracy of the training (and of the testing--85 is better than 83). However, it takes more time than a lower number, and it is possible that we do not need all that training for the testing to be accurate. We can also potentially get accurate training numbers by tuning the other hyperparameters. One likely reason the accuracy increases so much is that, with more episodes, a larger portion of training happens after epsilon has decayed, so the Agent spends more of its time exploiting. We will continue to explore and see what we find.
Test 3 (a)¶
- What did we change?
- num_episodes from 10000 to 1000
- exploration_decay_rate from 0.001 to 0.01 (the reason is that the function gets hung up with such a "small" number of episodes at the smaller decay rate)
- from here on, we will keep the decay rate at 0.01 until the next section
num_episodes = 1000
alpha = 0.1 # learning_rate
gamma = 0.99 # discount_rate
exploration_rate = 1 # epsilon
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.01 # decay rate
train('E10')
(54.226999999999997, 57.524077315503291)
evaluate_agent(n_eval_episodes, 'H1')
(85.0, 0.0)
What did we learn?¶
Decreasing the num_episodes to 1000 decreases the accuracy of the training, but it also decreases the time the function takes to run. Also, the testing was still the best version so far at this number, so it may end up being worth it to keep the training episodes lower, especially after we tune the other hyperparameters.
It is also worth noting that we had to make the epsilon decay rate larger when num_episodes is smaller because otherwise the functions get stuck. This makes sense because there are fewer episodes over which epsilon can decay, as the quick computation below shows. The reverse also holds: the larger decay rate does not work as well with the larger number of episodes, likely because epsilon bottoms out very early and the Agent barely explores after that.
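A quick computation (using the same decay formula as the training function, with the 0.01 minimum and 1 maximum taken from the settings above) shows the difference at the last of 1000 episodes:
# Epsilon at the final episode (episode 999) of a 1000-episode run, for both decay rates
for decay in [0.001, 0.01]:
    eps = 0.01 + (1 - 0.01) * np.exp(-decay * 999)
    print(decay, round(eps, 3))
# With a decay rate of 0.001 the Agent is still exploring roughly 37% of the time at the end,
# while with 0.01 epsilon has already bottomed out near 0.01 (almost pure exploitation).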
Test 4 (a)¶
- What did we change?
- gamma from 0.99 to 0.7
num_episodes = 1000
alpha = 0.1 # learning_rate
gamma = 0.7 # discount_rate
exploration_rate = 1 # epsilon
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.01 # decay rate
train('E10')
(50.228000000000002, 60.133651277799522)
evaluate_agent(n_eval_episodes, 'H1')
(85.0, 0.0)
What did we learn?¶
Based on this test, we should try to keep gamma as high as possible.
Test 5 (a)¶
- What did we change?
- alpha from 0.1 to 0.5
num_episodes = 1000
alpha = 0.5 # learning_rate
gamma = 0.99 # discount_rate
exploration_rate = 1 # epsilon
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.01 # decay rate
train('E10')
(59.908999999999999, 52.762796732167253)
evaluate_agent(n_eval_episodes, 'H1')
(85.0, 0.0)
What did we learn?¶
In this test, we learned that, compared to Test 3, a higher alpha performs better.
Test 6 (a)¶
- What did we change?
- alpha from 0.5 to 0.99
num_episodes = 1000
alpha = 0.99 # learning_rate
gamma = 0.99 # discount_rate
exploration_rate = 1 # epsilon
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.01 # decay rate
train('E10')
(61.021999999999998, 51.91371221556016)
evaluate_agent(n_eval_episodes, 'H1')
(85.0, 0.0)
What did we learn?¶
In this final test in this section, we confirm that the model performs fairly well when alpha is high. We will explore more in the next section.
Exploring Consistency¶
In this section, we will explore the distribution of path lengths produced by running the function pre_extra_stop() 100 times. Our goal is for the majority of runs to return the shortest path, with as few distinct paths suggested as possible.
Defining Function and setting up for tests¶
# Number of times we want to run our final function
num_tests = 100
# This function uses num_tests to check the path length each time
# the function pre_extra_stop() is run (chosen because it returns a list whose length can be counted)
# The dictionary then counts how often each path length
# comes up across the tests
def my_test_function(num_tests, starting_cell_position, ending_cell_position):
    """
    Checks the distribution of suggested path lengths by running pre_extra_stop()
    num_tests times
    Args:
    num_tests: Number of times to run pre_extra_stop() function
    starting_cell_position: Starting cell label (ex: 'D4')
    ending_cell_position: Ending cell label (ex: 'A10')
    returns:
    length_counts(dict): Dictionary with each path length as a key and
    its frequency as the value
    """
    path_lengths = []
    for _ in range(num_tests):
        path_lengths.append(len(pre_extra_stop(starting_cell_position, ending_cell_position)))
    # Count how often each path length appeared
    length_counts = {length: path_lengths.count(length) for length in path_lengths}
    return length_counts
Note--The function my_test_function() can take a while to run depending on num_episodes
Note--The numbers I mention in the subsequent sections may be slightly off due to the nature of testing. However, the overall ideas should still be accurate regardless.
Running Tests on Consistency¶
Test 1 (b)¶
- We are going to start with where we left off in the previous section
num_episodes = 1000
alpha = 0.99 # learning_rate
gamma = 0.99 # discount_rate
exploration_rate = 1 # epsilon
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.01 # decay rate
my_test_function(num_tests, 'H1', 'E10')
{19: 38, 17: 60, 21: 3}
What did we learn?¶
Here, we start to see the problem. In the 100 times the pre_extra_stop() function was run, it only got the actual shortest path 56 of those times. The other path lengths are close, but ideally we would want a higher chance of getting the shortest path every time we run the function.
Test 2 (b)¶
- What did we change?
- We increased the num_episodes back to 10000
- We changed the decay rate back to 0.001
num_episodes = 10000
alpha = 0.99 # learning_rate
gamma = 0.99 # discount_rate
exploration_rate = 1 # epsilon
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001 # decay rate
my_test_function(num_tests, 'H1', 'E10')
{17: 93, 19: 8}
What did we learn?¶
In this test, we learned that the model performed much better when there were more training episodes. There were fewer path options, and it got the shortest path 91 out of 100 times. However, this comes at the expense of a speedy function later on. Let's see if we can get a version that is almost as good but faster.
Test 3 (b)¶
- What did we change?
- We dropped the num_episodes down to 2500 to try and speed up the function
- We changed the decay rate to 0.01
num_episodes = 2500
alpha = 0.99 # learning_rate
gamma = 0.99 # discount_rate
exploration_rate = 1 # epsilon
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.01 # decay rate
my_test_function(num_tests, 'H1', 'E10')
{17: 72, 19: 26, 21: 3}
What did we learn?¶
This test performed slightly better than Test 1 (b), but not by a large margin, and it performed much worse than the 10,000-episode test. However, it runs faster, so let's run another test to see if we can get 2500 episodes to improve.
Test 4 (b)¶
- What did we change?
- We changed the decay rate back to 0.001
num_episodes = 2500
alpha = 0.99 # learning_rate
gamma = 0.99 # discount_rate
exploration_rate = 1 # epsilon
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001 # decay rate
my_test_function(num_tests, 'H1', 'E10')
{17: 82, 19: 19}
What did we learn?¶
Decreasing the decay rate really helped in this case. The final breakdown was that it got the shortest path 84 out of the 100 times. While this is not quite as good as the 10,000-episode version, it will take much less time to run in the end.
Running the Functions and Visuals¶
Below are the best versions of the hyperparameters, given the tradeoffs.
# These are the tuned hyperparameters that were decided earlier
num_episodes = 2500
alpha = 0.99 # learning_rate
gamma = 0.99 # discount_rate
exploration_rate = 1 # epsilon
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001 # decay rate
How does this tie back into our problem?¶
If we remember our Case Study from the Presentation, making a system similar to this one could help companies like Stitch Fix (or Amazon) find optimal pathways for their robots to collect items for packing. The goal of finding the shortest path is to reduce time and increase efficiency so they can do more business. Also, having autonomous robots that can find pathways through a warehouse (despite obstacles) frees people up to do other tasks, or to do their existing tasks more efficiently.
Note--Since there are sometimes multiple valid solutions, the path the code produces and the one in the picture may not match. However, each picture shows a path the code has produced before.
Here is what it looks like when running the function!¶
final_path('D4', 'A10')
D4 => C4 => B4 => A4 => A5 => A6 => A7 => A8 => A9 => A10

Image Source: Author
final_path('D1', 'E10')
D1 => D2 => D3 => C3 => C4 => B4 => B5 => B6 => B7 => B8 => C8 => C9 => C10 => D10 => E10

Image Source: Author
extra_stop('D4', 'D6', 'A10')
D4 => D5 => D6 => C6 => B6 => A6 => A7 => A8 => A9 => A10

Image Source: Author
final_path('H1', 'E10')
H1 => H2 => H3 => H4 => H5 => G5 => F5 => F6 => F7 => F8 => E8 => D8 => C8 => C9 => C10 => D10 => E10

Image Source: Author
extra_stop('H1', 'H7', 'E10')
H1 => H2 => H3 => H4 => H5 => G5 => G6 => G7 => H7 => G7 => F7 => F8 => E8 => D8 => C8 => C9 => C10 => D10 => E10

Image Source: Author