# THE K NEAREST NEIGHBOR METHOD.  Applies the k-NN method using the values of
# k in the kvec argument (a vector of length K) to predict the response for
# the test cases with inputs x_test (an m by p matrix), using the training
# cases with inputs x_train (an n by p matrix) and responses y_train (a vector
# of length n).  The result is an m by K matrix of predictions for the test
# cases, with columns containing predictions found using the various values
# of k in kvec.
#
# Distances are squared Euclidean distances between input vectors; the
# prediction for a test case is the mean response of its k nearest training
# cases (ties in distance are broken by training case order, as done by
# 'order').
#
# The x_test argument can be a vector, in which case it is converted to a
# matrix with one row.
#
# Every value in kvec must be a non-missing number that is at least 1; an
# error is signalled otherwise.  A value of k larger than the number of
# training cases gives a prediction of NA.  A fractional k is truncated
# towards zero by the 1:k indexing.  If x_test has zero rows or kvec has zero
# length, a matrix with zero rows or zero columns is returned.

knn = function (kvec, x_train, y_train, x_test)
{
  # Convert x_test to a matrix with one row if it's just a vector (ie, if
  # there's just one test case).

  if (!is.matrix(x_test))
  { x_test = matrix(x_test,nrow=1)
  }

  # Check that numbers of cases and inputs are compatible.

  if (nrow(x_train)!=length(y_train))
  { stop(
     "Number of training cases for inputs doesn't match number for responses")
  }

  if (ncol(x_train)!=ncol(x_test))
  { stop(
     "Number of inputs for training cases doesn't match number for test case")
  }

  # Check the values of k.  Without this check, k=0 would silently give the
  # 1-NN prediction (since 1:0 is c(1,0)), and negative or missing values
  # would fail later with obscure subscript errors.

  if (!is.numeric(kvec) || any(is.na(kvec)) || any(kvec<1))
  { stop("Values in kvec must be non-missing numbers that are at least 1")
  }

  # Allocate some variables.

  n.test = nrow(x_test)
  n.k = length(kvec)

  p.test = matrix (NA, n.test, n.k)  # Holds predictions for each k

  # Transpose the training inputs once, so that subtracting a test case
  # (a vector of length p) recycles down each column (one training case).

  x_train_t = t(x_train)

  # Find predictions for each test case, for each k.  Note that seq_len is
  # used so that no iterations are done when there are no test cases or no
  # values of k.

  for (tst in seq_len(n.test))
  {
    # Find squared distances from this test case to each training case.

    dsq = colSums ((x_train_t-x_test[tst,])^2)

    # Order indexes of training cases by distance to test case.

    ord = order(dsq)

    # Make predictions using the various values of k.  If k exceeds the
    # number of training cases, the indexes past the end are NA, and so
    # the prediction is NA.

    for (i in seq_len(n.k))
    { p.test[tst,i] = mean(y_train[ord[1:kvec[i]]])
    }
  }

  # Return predictions for test cases.

  p.test
}