Every time I need to do something in R it nearly always means a Google search or trawling through my previous code to see how I did it before. Here I am going to post some snippets of code, mainly for my own use so that I know where to find them. Much of this code will be 'borrowed' and probably not the most efficient (I like to write code the long way so I can follow what is going on), but it seems to work. If anyone finds it doesn't work, or knows a more efficient way, then please let me know.
1. Randomly sampling data into a train and test set
totalrecords <- nrow(mydata)
trainfraction <- 0.7
trainrecords <- as.integer(totalrecords * trainfraction)
allrows <- 1:totalrecords
trainrows <- sample(totalrecords, trainrecords)
testrows <- allrows[-trainrows]
#check
length(trainrows)
length(testrows)
#then build the model, something like...
model <- lm(theFormula, data=mydata[trainrows,])
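Once the model is built on the training rows it can be checked against the held-out rows. A rough sketch of scoring the test set, assuming the target column is numeric and is called theTarget (not defined in the snippet above, just for illustration):
#predict on the test rows and compare to the actuals
testpred <- predict(model, mydata[testrows,])
testactual <- mydata[testrows, theTarget]
#root mean squared error on the held-out data
sqrt(mean((testactual - testpred)^2))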
1a. Randomly sampling data into a train and test set (a shorter way)
Thanks to Isamoor
#logical vectors: TRUE marks a training row (about 70% of the data), the rest are test rows
trainrows <- runif(nrow(mydata)) < 0.7
testrows <- !trainrows
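These are logical vectors, so they can be used directly to subset the data frame, for example:
trainset <- mydata[trainrows,]
testset <- mydata[testrows,]
nrow(trainset)
nrow(testset)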
2. Repeated n-fold cross validation
This generates a set of cross-validated predictions, useful when you want to estimate the expected error of a model, or to produce out-of-fold predictions for working out ensemble weightings.
######################################
# the error function
calc_RMSE <- function(act, pred){
  aact <- as.matrix(act)
  ppred <- as.matrix(pred)
  if(nrow(aact) == nrow(ppred)){
    #root mean squared error
    return(sqrt(sum((ppred - aact) ^ 2) / nrow(aact)))
  } else {
    #lengths don't match - return an obviously wrong value
    return(-99)
  }
}
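#a quick sanity check with some made-up numbers, just to show the call
calc_RMSE(c(1,2,3,4), c(1.1,1.9,3.2,3.8))   #should give roughly 0.158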
#####################################
###########################
#Load and prepare data
###########################
databuild <- iris
datascore <- iris #put real score set here
#target - what we are predicting
theTarget <- 'Sepal.Length'
#set the formula
theFormula <- as.formula(paste(theTarget," ~ . "))
#find the position of the target
targindex <- which(names(databuild)==theTarget)
#actuals
build_actuals <- databuild[,targindex]
#######################################
#vectors to store the model outputs
#######################################
buildcases <- nrow(databuild)
scorecases <- nrow(datascore)
pred_train <- vector(length=buildcases)
pred_test <- vector(length=buildcases)
pred_score <- vector(length=scorecases)
pred_trainLoop <- vector(length=buildcases)
pred_testLoop <- vector(length=buildcases)
pred_scoreLoop <- vector(length=scorecases)
#settings
numloops <- 300
numfolds <- 10
test_errors <- vector(length=numloops)
train_errors <- vector(length=numloops)
#initialise the accumulators that sum across loops
pred_testLoop <- 0
pred_trainLoop <- 0
pred_scoreLoop <- 0
modtype <- 'linear regression'
#####################################
# now the work
#####################################
for(loop in 1:numloops){
  # generate the fold id for each case
  id <- sample(rep(seq_len(numfolds), length.out=buildcases))
  # lapply over the folds to get the train and test row indices
  indices <- lapply(seq_len(numfolds), function(a) list(
    test = which(id==a),
    train = which(id!=a)
  ))
  #reset the predictions for this loop
  pred_train <- 0
  pred_test <- 0
  pred_score <- 0
  for(fold in 1:numfolds){
    #set the cases for this fold
    rows_train <- indices[[fold]]$train
    rows_test <- indices[[fold]]$test
    #build the model - use any model here
    model <- lm(theFormula, data=databuild[rows_train,])
    #score up the model on the build and scoring sets
    buildPred <- predict(model, databuild, type="response")
    scorepred <- predict(model, datascore, type="response")
    #accumulate the train predictions, zeroing out the held-out fold
    z <- buildPred
    z[rows_test] <- 0
    pred_train <- pred_train + z
    #keep the out-of-fold predictions for the cv (test) error
    pred_test[rows_test] <- buildPred[rows_test]
    pred_score <- pred_score + scorepred
  } #next fold
  #each case was in the train set of (numfolds - 1) folds, so average
  pred_train <- pred_train / (numfolds - 1)
  pred_score <- pred_score / numfolds
  #add to previous loop results
  pred_trainLoop <- pred_trainLoop + pred_train
  pred_testLoop <- pred_testLoop + pred_test
  pred_scoreLoop <- pred_scoreLoop + pred_score
  #calculate the running average errors
  train_errors[loop] <- calc_RMSE(build_actuals, pred_trainLoop / loop)
  test_errors[loop] <- calc_RMSE(build_actuals, pred_testLoop / loop)
  #report
  cat("\nloop = ",loop,"train error = ",train_errors[loop],"cv error = ",test_errors[loop])
  #plot a chart as we go
  if(loop > 1){
    plot(test_errors[1:loop], col='blue', type='l',
         main = paste(modtype, numloops, 'by', numfolds, '-fold cross validation'),
         xlab = 'Repetitions', ylab = 'RMSE',
         ylim = range(rbind(test_errors[1:loop], train_errors[1:loop])))
    abline(h=test_errors[loop], col='blue')
    points(train_errors[1:loop], type='l', col='red')
    abline(h=train_errors[loop], col='red')
    legend('top', c('test', 'train'), col=c('blue', 'red'), lty=1)
  }
} #loop
########################
#the cross validation predictions and scoring set predictions
#this is what we are after
cvPredictions <- pred_testLoop / numloops
scPredictions <- pred_scoreLoop / numloops
#plot should show decreasing test error with increasing train error
plot(train_errors,test_errors,type='p')
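One use of cvPredictions mentioned above is working out ensemble weightings. A rough sketch, assuming a second model has produced its own cross-validated and scoring set predictions on the same build data (cvPredictions2 and scPredictions2 are hypothetical names here):
#find the blending weight that minimises the cv error
blend_RMSE <- function(w){
  calc_RMSE(build_actuals, w * cvPredictions + (1 - w) * cvPredictions2)
}
best <- optimize(blend_RMSE, interval=c(0,1))
bestweight <- best$minimum
#apply the same weight to the scoring set predictions
finalPredictions <- bestweight * scPredictions + (1 - bestweight) * scPredictions2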