# FIT MLP NETWORK USING SIMPLE GRADIENT ASCENT.  The arguments are a vector
# of 0/1 responses for training cases, a matrix of inputs for training cases,
# two learning rates (for parameters on connections to hidden units and the
# output unit), the number of iterations of gradient ascent to do.  
#
# These arguments are followed by either be an argument, q, giving the number 
# of hidden units, or a vector of initial values for parameters (from which q 
# can be deduced).  If no initial parameter values are supplied, values are 
# randomly chosen independently from Gaussian distributions with mean zero and
# standard deviation 0.01, except that the bias for the output unit is set to 
# the logit of the fraction of training cases that are in class 1.  
#
# The result returned is a list with element ll giving the log likelihood
# for each iteration, element p1 giving the matrix of probabilities for
# class 1 for each iteration (rows) and training case (columns), and element
# params giving the values of the network parameters for each iteration (rows).
# Note that these results are for states after each iteration; the initial 
# values are not included.
#
# Note that an initial value can be supplied via the init argument that is the
# params element from the result of a previous call of mlp.train.

mlp.train = function (y, X, eta1, eta2, iters, q=NULL, init=NULL)
{
  n = nrow(X)
  p = ncol(X)

  if (is.null(q))
  { q = as.integer((length(init)-1) / (p+2))
  }

  M = (q+1) + q*(p+1)

  if (is.null(init))
  { init = c (log(mean(y)/(1-mean(y))), rnorm(M-1,0,0.01) )
  }

  if (length(init)!= M)
  { stop("Initial parameter vector is the wrong length")
  }

  eta = c (rep(eta2,q+1), rep(eta1,q*(p+1)) )
  
  ll = rep(NA,iters)
  params = matrix(NA,iters,M)
  p1 = matrix(NA,iters,n)

  current = init

  fw = mlp.forward(X,q,current)

  for (iter in 1:iters)
  { 
    bk = mlp.backward(y,X,q,current,fw)
    gr = mlp.grad(X,q,fw,bk)

    current = current + eta*gr

    params[iter,] = current

    fw = mlp.forward(X,q,current)
    p1[iter,] = 1/(1+exp(-fw$o))
    ll[iter] = mlp.log.likelihood(y,fw$o)
  }
  
  list (ll=ll, p1=p1, params=params)
}


# FORWARD NETWORK COMPUTATIONS.  Given the matrix of inputs, the number of
# hidden units, and the network parameters, this function returns a list with
# a matrix of hidden unit inputs (s) for all training cases, a corresponding 
# matrix of hidden unit values (h), and a vector of output unit values (o).

mlp.forward = function (X, q, params)
{
  n = nrow(X)
  p = ncol(X)

  if (length(params) != (q+1) + q*(p+1)) 
  { stop("Parameter vector is the wrong length")
  }
 
  beta0 = params[1]
  beta = params[2:(q+1)]
  gamma0 = params[(q+2):(2*q+1)]
  gamma0m = matrix(gamma0,n,q,byrow=T)
  gamma = matrix(params[(2*q+2):length(params)],p,q)

  s = X %*% gamma + gamma0m
  h = tanh(s)
  o = h %*% beta + beta0

  list(s=s, h=h, o=o)
}


# BACKWARD NETWORK COMPUTATIONS.  Given the training targets and inputs, the 
# number of hidden units, the network parameters, and the results of the 
# forward computation, this function returns a list of vectors and matrices of
# partial derivatives with respect to unit values, for each training case.

mlp.backward = function (y, X, q, params, fw)
{
  beta = params[2:(q+1)]

  p1 = 1/(1+exp(-fw$o))

  dl.do = y-p1
  dl.dh = dl.do %*% beta
  dl.ds = (1-fw$h^2) * dl.dh

  list (dl.do=dl.do, dl.dh=dl.dh, dl.ds=dl.ds)
}


# COMPUTE GRADIENT OF LOG LIKELIHOOD WITH RESPECT TO PARAMETERS.  Takes
# as arguments the matrix of inputs, the number of hidden units, and the 
# results of the forward and backward computations.  Returns a vector of 
# partial derivatives of the log likelihood with respect to the parameters.

mlp.grad = function (X, q, fw, bk)
{
  p = ncol(X)

  dl.dbeta0 = sum (bk$dl.do)
  dl.dbeta = t(fw$h) %*% bk$dl.do
  dl.dgamma0 = apply (bk$dl.ds, 2, sum)
  dl.dgamma = matrix(NA,p,q)
  for (j in 1:p)
  { dl.dgamma[j,] = X[,j] %*% bk$dl.ds
  }

  c (dl.dbeta0=dl.dbeta0, dl.dbeta=dl.dbeta, dl.dgamma0=dl.dgamma0, 
     dl.dgamma=dl.dgamma)
}


# COMPUTE THE LOG LIKELIHOOD GIVEN THE NETWORK OUTPUTS.  

mlp.log.likelihood = function (y, o)
{
  sum(-log(1+exp(-(2*y-1)*o)))
}


# MAKE PREDICTIONS FOR TEST CASES.  Takes the matrix of test inputs and the
# network parameters as inputs, and returns the vector of probabilities for
# class 1 in the test cases.

mlp.predict = function (X, params)
{ 
  p = ncol(X)
  q = as.integer( (length(params)-1) / (p+2) )
  fw = mlp.forward(X,q,params)
  1 / (1+exp(-fw$o))
}


# TRAIN NETWORK USING EARLY STOPPING.  Takes as arguments the training targets
# and inputs, two learning rates (for parameters to hidden units and to the
# output), the indexes of the validation set, the number of iterations of 
# gradient descent to do, the number of hidden units, and a TRUE/FALSE flag
# saying whether or not to produce a plot (default FALSE).

mlp.cv = function (y, X, eta1, eta2, val.set, iters, q, cv.plot=F)
{ 
  est.set = setdiff(1:length(y),val.set)

  r = mlp.train (trny[est.set], trnx[est.set,], eta1, eta2, iters, q)
  
  vpr = rep(NA,length(r$ll))
  for (i in 1:length(r$ll))
  { p = mlp.predict(trnx[val.set,],r$params[i,])
    vpr[i] = sum (log(p[trny[val.set]==1])) + sum (log(1-p[trny[val.set]==0]))
  }
  
  best = order(-vpr)[1]

  if (cv.plot) 
  { plot (r$ll/length(est.set), type="l", col="blue", 
          xlim=c(0,iters), xlab=paste("iteration, best at ",best), 
          ylim=range(c(r$ll/length(est.set),vpr/length(val.set))),
          ylab="average log probability (blue:estimation, red:validation)")
    lines (vpr/length(val.set), col="red")
  }

  r$params[best,]
}