Sunday 1 January 2012

Useful R Snippets

Every time I need to do something in R it nearly always means a Google search or a trawl through my previous code to see how I did it before. Here I am going to post some snippets of code - mainly for my own use so that I know where to find them. Much of this code will be 'borrowed' and probably not the most efficient (I like to write code the long way so I can follow what is going on) - but it seems to work. If anyone finds it doesn't work, or knows a more efficient way, then please let me know.


1. Randomly sampling data into a train and test set
totalrecords <- nrow(mydata)
trainfraction <- 0.7
trainrecords <- as.integer(totalrecords * trainfraction)
allrows <- 1:totalrecords
#sample trainrecords row numbers from 1:totalrecords
trainrows <- sample(totalrecords, trainrecords)
testrows <- allrows[-trainrows]
#check
length(trainrows)
length(testrows)
#then build model, something like...
model <- lm(theFormula, data=mydata[trainrows,])
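
And a quick sketch of checking the model on the held-out rows (here theTarget is assumed to hold the name of the column being predicted):

preds <- predict(model, newdata=mydata[testrows,])
sqrt(mean((mydata[testrows, theTarget] - preds)^2))  #test set RMSE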


1a. Randomly sampling data into a train and test set (shorter version)
Thanks to Isamoor
#each row is TRUE (train) with probability 0.7
trainrows <- runif(nrow(mydata)) < 0.7
testrows <- !trainrows
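
These give logical masks rather than row numbers, so they can be used to subset directly:

#usage - subset with the logical masks
train <- mydata[trainrows,]
test <- mydata[testrows,]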


2. Repeated n-fold cross-validation

This generates repeated cross-validation predictions - useful when you want to know the expected error of a model, or to produce an out-of-fold prediction set for calculating ensemble weightings. The full script below uses a linear model on iris, but any model can be dropped in.


######################################
# the error function
calc_RMSE <- function(act, pred){

    aact <- as.matrix(act)
    ppred <- as.matrix(pred)

    if(nrow(aact) == nrow(ppred)){
        return(sqrt(sum((ppred - aact) ^ 2) / nrow(aact)))
    } else {
        return(-99)  #flag a row-count mismatch
    }

}
#####################################
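
A quick check that the error function behaves as expected:

calc_RMSE(c(1,2,3,4), c(1.1,1.9,3.2,3.8))  #sqrt(0.1/4), about 0.158
calc_RMSE(c(1,2,3), c(1,2))                #mismatched lengths, returns -99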


###########################
#Load and prepare data
###########################
databuild <- iris
datascore <- iris #put real score set here

#target - what we are predicting
theTarget <- 'Sepal.Length'

#set the formula
theFormula <- as.formula(paste(theTarget," ~ . "))
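
#check the formula looks right - should print: Sepal.Length ~ .
print(theFormula)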

#find the position of the target
targindex <-  which(names(databuild)==theTarget)

#actuals
build_actuals <- databuild[,targindex]


#######################################
#vectors to store the model outputs
#######################################
buildcases <- nrow(databuild)
scorecases <- nrow(datascore)

pred_train <- numeric(buildcases)
pred_test <- numeric(buildcases)
pred_score <- numeric(scorecases)

#accumulators across the repeated loops
pred_trainLoop <- numeric(buildcases)
pred_testLoop <- numeric(buildcases)
pred_scoreLoop <- numeric(scorecases)

#settings
numloops <- 300
numfolds <- 10

test_errors <- numeric(numloops)
train_errors <- numeric(numloops)

modtype <- 'linear regression'

#####################################
# now the work
#####################################        
for(loop in 1:numloops){

    #randomly assign each case to one of the numfolds folds
    id <- sample(rep(seq_len(numfolds), length.out=buildcases))

    #build the train/test row indices for each fold
    indices <- lapply(seq_len(numfolds), function(a) list(
        test = which(id==a),
        train = which(id!=a)
    ))
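    #optional sanity check - every case should appear in exactly one test fold
    stopifnot(all(sort(unlist(lapply(indices, function(a) a$test))) == seq_len(buildcases)))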
    
    #reset the predictions for this loop
    pred_train <- numeric(buildcases)
    pred_test <- numeric(buildcases)
    pred_score <- numeric(scorecases)
    
        for(fold in 1:numfolds){
            
            #set the cases for this fold
            rows_train <- indices[[fold]]$train
            rows_test  <- indices[[fold]]$test

            #build the models - use any model
            model <- lm(theFormula, data=databuild[rows_train,])

            #score the full build set and the scoring set with this fold's model
            buildPred <- predict(model, databuild, type="response")
            scorePred <- predict(model, datascore, type="response")

            #accumulate train predictions, zeroing out the test fold so each
            #row only collects predictions from models it helped to build
            z <- buildPred
            z[rows_test] <- 0
            pred_train <- pred_train + z
            #out-of-fold (cv) predictions for this fold's test rows
            pred_test[rows_test] <- buildPred[rows_test]
            pred_score <- pred_score + scorePred
        } #next fold

    
    #average the train predictions (each row appears in numfolds-1 training sets)
    pred_train <- pred_train / (numfolds - 1)
    pred_score <- pred_score / numfolds
    
    #add to previous loop results
    pred_trainLoop <- pred_trainLoop + pred_train
    pred_testLoop <- pred_testLoop + pred_test
    pred_scoreLoop <- pred_scoreLoop + pred_score
    
    #calculate the errors of the predictions averaged over all loops so far
    train_errors[loop] <- calc_RMSE(build_actuals,pred_trainLoop / loop)
    test_errors[loop] <- calc_RMSE(build_actuals,pred_testLoop / loop)
    
    #report
    cat("\nloop = ",loop,"train error = ",train_errors[loop],"cv error = ",test_errors[loop]) 

    #plot a chart as we go
    if(loop>1){
        plot(test_errors[1:loop], col='blue', type='l',
             main = paste(modtype, numloops, 'by', numfolds, '-fold cross validation'),
             xlab = 'Repetitions', ylab = 'RMSE',
             ylim = range(c(test_errors[1:loop], train_errors[1:loop])))
        abline(h=test_errors[loop],col='blue')
        points(train_errors[1:loop],type='l',col='red')
        abline(h=train_errors[loop],col='red')
        legend('top',c('test','train'),col=c('blue','red'),lty=1)
    }

} #loop
########################


#the cross-validation predictions and the scoring-set predictions
#this is what we are after
cvPredictions <- pred_testLoop / numloops
scPredictions <- pred_scoreLoop / numloops

#the plot should show test error decreasing as train error increases
plot(train_errors, test_errors, type='p')
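
As a final check, the overall cv error should match the last value in test_errors, and the scoring predictions can be written out for later use (the file name here is just an example):

calc_RMSE(build_actuals, cvPredictions)  #should equal test_errors[numloops]
write.csv(data.frame(pred=scPredictions), "cv_score_predictions.csv", row.names=FALSE)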