# UPDATE THE TABLE OF THE Q FUNCTION.  Arguments are a state (s), the action 
# taken in this state (a), the reward obtained the next time step (r), the new 
# state entered (sn), the old Q table, the discount to apply to rewards (gamma),
# the learning rate (alpha), and the probability of taking a random action
# (epsilon).  Returns the updated Q table.

Qlearn <- function(s, a, r, sn, Q, gamma, alpha, epsilon) {
  # One tabular Q update for the transition (s, a) -> (r, sn).
  #
  #   s, a     state and action whose table entry is updated
  #   r        reward received for that transition
  #   sn       state entered next; its row of Q supplies the target
  #   Q        current Q table, indexed as Q[state, action]
  #   gamma    discount applied to the value of the next state
  #   alpha    learning rate (weight given to the new target)
  #   epsilon  weight on the mean (random action) versus the max (greedy
  #            action) of the next state's Q values
  #
  # Returns a copy of Q in which only the entry Q[s, a] has been changed.

  # Q values of every action in the state that was entered.
  next.q <- Q[sn, ]

  # Value of the next state: the average over actions weighted by epsilon,
  # and the best action weighted by 1 - epsilon.
  next.value <- epsilon * mean(next.q) + (1 - epsilon) * max(next.q)

  # Target is the reward plus the discounted value of the next state.
  target <- r + gamma * next.value

  # Move the old estimate towards the target by a fraction alpha.
  Q[s, a] <- (1 - alpha) * Q[s, a] + alpha * target

  Q
}


# SIMULATE Q-LEARNING IN SOME WORLD.  Arguments are a function generating an
# initial state of the world (init), the function defining the transitions
# and rewards (world), the discount rate for rewards (gamma), the learning 
# rate (alpha), the probability of taking a random action (epsilon), and the 
# number of steps to simulate (steps).  
#
# The global variables n.states and n.actions should contain the number of
# states (labelled from 1 up) and the number of actions (labelled from 1 up).
#
# Returns a list containing a matrix with the history of the simulation, one 
# row per time step, and the final Q function table.

simulate <- function(init, world, gamma, alpha, epsilon, steps) {
  # Run Q-learning with epsilon-greedy action selection for `steps` steps.
  #
  #   init     function of no arguments returning the initial state
  #   world    function (s, a) returning a list whose element r is the
  #            reward and whose element s is the next state
  #   gamma    discount rate, passed on to Qlearn
  #   alpha    learning rate, passed on to Qlearn
  #   epsilon  probability of choosing an action at random; also passed on
  #            to Qlearn
  #   steps    number of time steps to simulate (0 or more)
  #
  # Reads the global variables n.states and n.actions to size the Q table.
  #
  # Returns list(history=, Q=).  history has one row per time step, with
  # columns t (time step), s (state), a (action taken), r (reward),
  # rs (exponentially smoothed reward), and sn (next state).  Q is the
  # final Q table.

  history <- matrix(NA, steps, 6)
  colnames(history) <- c("t", "s", "a", "r", "rs", "sn")

  Q <- matrix(0, n.states, n.actions)

  s <- init()

  # seq_len() rather than 1:steps, so that steps = 0 performs no iterations
  # instead of counting down through c(1, 0).
  for (t in seq_len(steps)) {
    if (runif(1) < epsilon) {
      # Explore: pick an action uniformly at random.
      a <- sample(n.actions, 1)
    } else {
      # Exploit: take the last index in ascending order of Q value, i.e. an
      # action with the largest Q value.  order() leaves ties in their
      # original order, so ties go to the highest-numbered action.
      a <- order(Q[s, ])[n.actions]
    }

    w <- world(s, a)
    r <- w$r
    sn <- w$s

    Q <- Qlearn(s, a, r, sn, Q, gamma, alpha, epsilon)

    history[t, "t"] <- t
    history[t, "s"] <- s
    history[t, "a"] <- a
    history[t, "r"] <- r
    history[t, "sn"] <- sn

    s <- sn
  }

  # Exponentially smoothed reward: starts at the first reward, after which
  # each value is 0.1 of the current reward plus 0.9 of the previous
  # smoothed value.
  history[, "rs"] <- history[, "r"]

  # seq_len(steps)[-1] is 2, ..., steps, and is empty when steps < 2.  The
  # original 2:steps counted down to c(2, 1) for steps = 1 and indexed past
  # the end of the history matrix.
  for (t in seq_len(steps)[-1]) {
    history[t, "rs"] <- 0.1 * history[t, "rs"] + 0.9 * history[t - 1, "rs"]
  }

  list(history = history, Q = Q)
}


# PLOT INFORMATION FROM HISTORY.  Produces three plots, of states, actions,
# and reward over time (with exponentially smoothed reward as well).  
# A suitable par(mfrow=...) or par(mfcol=...) should be done before calling 
# this function.

hplot <- function(history) {
  # Draw three scatter plots against time step from a simulation history:
  # state, action, and reward.  The reward plot also carries the smoothed
  # reward as a gray line.  The plot layout (par(mfrow=...) etc.) is left
  # to the caller.
  #
  #   history  matrix with columns "t", "s", "a", "r", and "rs"

  time.step <- history[, "t"]
  x.label <- "time step"

  # State and action each get a plot of their own; the names are the
  # history columns and the values are the y-axis labels.
  panels <- c(s = "state", a = "action")
  for (column in names(panels)) {
    plot(time.step, history[, column], pch = 20,
         xlab = x.label, ylab = panels[[column]])
  }

  # Raw reward as points, with the smoothed reward drawn over it.
  plot(time.step, history[, "r"], pch = 20,
       xlab = x.label, ylab = "reward / smoothed reward")
  lines(time.step, history[, "rs"], col = "gray")
}