2019 << next previous >> |
Congratulations to Samuel Messina for winning the $1,000 milestone prize. The write-up of his technique can be found here.
2019 << next previous >> |
Buy Low Sell High - leaderboard and rule clarification |
2019 <<Previous Next>> |
2019 <<previous next>> |
#----------------------------------------------------------
#
# some R code to test your submission file is valid
#
#----------------------------------------------------------
library(data.table)
#define where the data is
fileToTest <- 'test_submission_file_1.csv'
dataFolder <- "D:/buylowsellhigh/downloaded/"
theDataFile <- paste0(dataFolder,fileToTest)
#read in the data
tradeData <- fread(theDataFile)
#set error message; it stays 'OK' unless one of the checks below fails
ermsg <- 'OK'
#these are the column names required
requiredCols <- c('strategyName','keys_pair','enterTime','exitTime','direction','percentPair')
#these required columns have to hold numbers
numericCols <- c('enterTime','exitTime','direction','percentPair')
#------------------------
#do some basic checks
#(only the first failing check is reported)
#-----------------------
missing_cols <- setdiff(requiredCols,colnames(tradeData))
#is.numeric() always yields a single TRUE/FALSE. The previous test,
#class(x) == "character", has one element per class, so a column with more
#than one class (e.g. a date or timestamp) made if() fail on R >= 4.2.
notNumericCols <- numericCols[!vapply(
  numericCols,
  function(thisCol) is.numeric(tradeData[[thisCol]]),
  logical(1)
)]
if (length(missing_cols) > 0) {
  ermsg <- paste('\n incorrect column names in the trades file. We are mising:\n',paste(missing_cols,collapse = ","))
} else if (nrow(tradeData) == 0) {
  #checked before the column types so an empty file is reported as empty
  ermsg <- '\n the file contains no records'
} else if (anyNA(tradeData)) {
  #checked before the column types so that a column of blanks (which may not
  #be read in as numeric) is reported as missing rather than as non numeric
  ermsg <- '\n the file contains missing values'
} else if (length(notNumericCols) > 0) {
  #report the first offending field, in the order listed in numericCols
  ermsg <- paste0('\n the field ',notNumericCols[1],' contains non numeric values')
} else if (min(tradeData$exitTime - tradeData$enterTime) <= 0) {
  ermsg <- '\n the exit time needs to be after the entry time'
}
if (ermsg != 'OK') {
  stop(ermsg)
}
#-------------------------------------
# check that we are not over 100% in
# a trade at any particular time
#
# For every pair / strategy / direction combination each trade is expanded
# into the time steps it occupies (enterTime, enterTime + timeStep, ... up to
# but not including exitTime). percentPair is then summed per time step and
# the script stops if any sum is above 1.
#-------------------------------------
timeStep <- 5
tradeData[,xKey := paste(keys_pair,strategyName,direction,sep="_")]
allKeys <- unique(tradeData$xKey)
for (thisKey in allKeys){
  pairStrategyTradeData <- subset(tradeData,xKey == thisKey)
  E_numberOfTrades <- nrow(pairStrategyTradeData)
  cat("\n",thisKey,E_numberOfTrades)
  # a trade shorter than one time step cannot be expanded: seq() below would
  # fail with an unhelpful "wrong sign in 'by' argument" error, so report it
  # explicitly instead
  tooShort <- which(pairStrategyTradeData$exitTime - pairStrategyTradeData$enterTime < timeStep)
  if (length(tooShort) > 0) {
    ermsg <- paste("\n a trade has to last at least",timeStep,"minutes:\n",thisKey,
                   "\nenterTime=",pairStrategyTradeData$enterTime[tooShort[1]],
                   "\nexitTime=",pairStrategyTradeData$exitTime[tooShort[1]])
    stop(ermsg)
  }
  #get the times we are in a trade (one vector of time steps per trade)
  timesPerTrade <- lapply(seq_len(E_numberOfTrades), function(tradeNumber) {
    seq(from = pairStrategyTradeData$enterTime[tradeNumber],
        to = pairStrategyTradeData$exitTime[tradeNumber] - timeStep,
        by = timeStep)
  })
  allTimes1 <- unlist(timesPerTrade)
  #repeat each trade's percentPair once per time step it occupies
  allPercent1 <- rep(pairStrategyTradeData$percentPair,times = lengths(timesPerTrade))
  #make sure the max percent at any one time is not > 1
  d <- data.frame(allTimes1,allPercent1)
  percentByTime <- aggregate(allPercent1 ~ allTimes1,data=d,sum)
  maxPercent <- max(percentByTime$allPercent1)
  if (maxPercent > 1){
    # look the time up in the aggregated table: its rows are one per distinct
    # time, so its row numbers do not line up with the rows of d
    aBadTime <- percentByTime$allTimes1[which(percentByTime$allPercent1 > 1)[1]]
    ermsg <- paste("\n you cannot be more than 100% in a trade at any one time:\n",thisKey,"\ntime=",aBadTime,"\npercent =",maxPercent)
    stop(ermsg)
  }
} # thisKey
cat('\nthe file appears to be OK!')
# Extended set of checks on tradeData. ermsg is only assigned when a check
# fails (first failure wins); it is expected to have been initialised by the
# surrounding script, and nothing here stops execution.
requiredCols <- c('strategyName','keys_pair','enterTime','exitTime','direction','percentPair')
requiredPairs <- paste0('pair_',1:22)
#these required columns have to hold numbers
numericCols <- c('enterTime','exitTime','direction','percentPair')
missing_cols <- setdiff(requiredCols,colnames(tradeData))
#is.numeric() always yields a single TRUE/FALSE. The previous test,
#class(x) == "character", has one element per class, so a column with more
#than one class (e.g. a date or timestamp) made if() fail on R >= 4.2.
notNumericCols <- numericCols[!vapply(
  numericCols,
  function(thisCol) is.numeric(tradeData[[thisCol]]),
  logical(1)
)]
#do some checks
if (length(missing_cols) > 0) {
  ermsg <- paste('\n incorrect column names in the trades file. We are mising:\n',paste(missing_cols,collapse = ","))
} else if (nrow(tradeData) == 0) {
  #checked before the column types so an empty file is reported as empty
  ermsg <- '\n the file contains no records'
} else if (anyNA(tradeData)) {
  #checked before the column types so that a column of blanks (which may not
  #be read in as numeric) is reported as missing rather than as non numeric
  ermsg <- '\n the file contains missing values'
} else if (length(notNumericCols) > 0) {
  #report the first offending field, in the order listed in numericCols
  ermsg <- paste0('\n the field ',notNumericCols[1],' contains non numeric values')
} else if (min(tradeData$exitTime - tradeData$enterTime) <= 0) {
  ermsg <- '\n the exit time needs to be after the entry time'
} else if (nrow(tradeData) < 50) {
  ermsg <- '\n not enough trades'
} else if (max(tradeData$direction) > 1) {
  ermsg <- '\n trade direction > 1'
} else if (min(tradeData$direction) < 0) {
  ermsg <- '\n trade direction < 0'
} else if (length(unique(tradeData$direction)) > 2) {
  ermsg <- '\n too many trade directions'
} else if (max(tradeData$enterTime %% 5) > 0) {
  #times have to be multiples of 5
  ermsg <- '\n wrong times'
} else if (max(tradeData$exitTime %% 5) > 0) {
  ermsg <- '\n wrong times'
} else if (min(tradeData$percentPair) < 0) {
  ermsg <- '\n negative percent pairs'
} else if (!all(requiredPairs %in% tradeData$keys_pair)) {
  #covers both "none of the required pairs" and "only some of them"; the two
  #cases used to be separate branches with the same message
  ermsg <- '\n you need to predict for all pairs'
} else if (min(tradeData[,.N,by = c('keys_pair','direction')]$N) < 5) {
  #counts are taken per keys_pair / direction combination present in the file
  ermsg <- '\n you need at least 5 trades per pair and direction'
}
2019 << previous post (getting the data) next>> |
############################################################
#
# sample R code for the 2019 Melbourne Datathon that will
# generate a valid submission file.
#
############################################################
library(data.table)
#----------------------------------------------
# where the competition data lives; load it
#----------------------------------------------
dataFolder <- "D:/buylowsellhigh/downloaded/"
theDataFile <- paste0(dataFolder,"melbdatathon2019_buylowsellhigh.csv")
dt <- fread(theDataFile)
#-------------------------
# strategy settings
#-------------------------
feePerc          <- 0.0015  # fixed constant for the competition
longThresh       <- 0.002   # enter when the prediction is at least this value
longRemainWindow <- 12      # number of hours to remain in a trade
#---------------------------------------------------
# longEntry is 1 where the entry criterion is met and
# 0 everywhere else (including missing predictions)
#---------------------------------------------------
dt[, longEntry := as.numeric(!is.na(Lpred7b) & Lpred7b >= longThresh)]
#----------------------------------------------------
# every entry row produces one exit row for the same
# pair, longRemainWindow hours (in minutes) later
#----------------------------------------------------
exitLongs <- dt[which(longEntry == 1), c('keys_pair','minutesSinceStart'), with = FALSE]
exitLongs[, c('minutesSinceStart','longExit') := list(minutesSinceStart + (longRemainWindow * 60), 1)]
#--------------------------------------------------------
# left join the exit flags onto the data by pair and time
#--------------------------------------------------------
myKeys <- c("keys_pair","minutesSinceStart")
setkeyv(exitLongs,myKeys)
setkeyv(dt,myKeys)
dt <- merge(dt,exitLongs,by=myKeys,all.x=TRUE)
#--------------------------------------------
# rows without a matching exit get longExit 0
#--------------------------------------------
set(dt, i = which(is.na(dt$longExit)), j = "longExit", value = 0)
#--------------------------------------------------
# if there is an exit and enter, change the exit
# (a row that is both an entry and an exit keeps only the entry)
#--------------------------------------------------
dt[longEntry == 1 & longExit == 1, longExit := 0]
#-----------------------------------------------------
# make sure rows are ordered by pair and then by time,
# so that "consecutive" below means consecutive in time
#-----------------------------------------------------
setkeyv(dt,myKeys)
#-----------------------------------------------------
# firstInRun: 1 on the first row of every run of consecutive rows where
# flag == 1 within the same pair, 0 on all other rows.
#   pair : vector identifying the pair of each row
#   flag : 0/1 vector of the same length
# rleid() numbers the runs of constant (pair, flag), so a run can never span
# two pairs; rowid() numbers the rows inside each run.
#-----------------------------------------------------
firstInRun <- function(pair, flag) {
  as.numeric(flag == 1 & rowid(rleid(pair, flag)) == 1)
}
#--------------------------------------
# the first entry in the run is a BUY
#--------------------------------------
dt[, buy := firstInRun(keys_pair, longEntry)]
#-------------------------------------
# The first exit in the run is a SELL
#-------------------------------------
dt[, sell := firstInRun(keys_pair, longExit)]
#---------------------------------------------------
# Keep just the rows where we are BUYING or SELLING
# (exactly one of buy / sell is set)
#---------------------------------------------------
d1 <- subset(dt,buy != sell,select = c('keys_pair','minutesSinceStart','tradePrice','buy','sell'))
#------------------------------------------
# Make sure each buy is followed by a sell:
# within a pair, drop any row of the same type as the row before it
#------------------------------------------
d1[,prevRowBuy := shift(buy, 1, type="lag") , by=keys_pair]
d1[,prevRowSell := shift(sell, 1, type="lag") , by=keys_pair]
# The first row of each pair has no previous row, so its lag is NA. It has to
# be kept explicitly: subset() treats an NA condition as FALSE, which used to
# throw away the first buy/sell row of every pair.
d1 <- subset(d1,is.na(prevRowBuy) | (prevRowBuy != buy & prevRowSell != sell))
#--------------------------------------------------------------
#find the next price (for buy rows it will be the sell price)
#--------------------------------------------------------------
d1[,sellPrice := shift(tradePrice, 1, type="lead"), by=keys_pair]
d1[,minutesSinceStartExit := shift(minutesSinceStart, 1, type="lead"), by=keys_pair]
d1[, tradeLength := minutesSinceStartExit - minutesSinceStart]
#----------------------------------------------------------------
# now we only need the buy rows as we have the sell time & price
# (a buy with no later sell in its pair is dropped)
#----------------------------------------------------------------
d1 <- subset(d1,buy==1 & !is.na(minutesSinceStartExit))
#-------------------------------------------------------------------
# long profit estimate (we only have prices for some pairs though)
# this does not include a stoploss calculation
# the fee is charged on both the buy and the sell price
#--------------------------------------------------------------------
d1[,tradeProfit := ( (sellPrice * (1 - feePerc)) - (tradePrice * (1 + feePerc)) )/ tradePrice]
hist(d1$tradeProfit,breaks=100,col='blue');abline(v=0,col='red')
#----------------------------------------------------------
# build the submission table from the trades and write it
# to <dataFolder><stratName>.csv
#----------------------------------------------------------
stratName <- 'demo_LongOnly'
# add the submission columns in one step; every trade has
# direction 1 and percentPair 1
d1[, `:=`(
  enterTime    = minutesSinceStart,
  exitTime     = minutesSinceStartExit,
  direction    = 1,
  percentPair  = 1,
  strategyName = stratName
)]
# keep only the submission columns, in this order
d1 <- d1[, c('keys_pair','enterTime','exitTime','direction','percentPair','strategyName'), with = FALSE]
fwrite(d1,paste0(dataFolder,stratName,'.csv'))
2019 |
#--------------------------------------------------------------------------------------
# some R code to get you started with the 2019 Melbourne Datathon analytic challenge
# download the data from the link below and unzip to 'dataFolder'
# https://drive.google.com/file/d/11bWbg9kSmGXNBUOdGXv7MtWszTfVTUfR (553 mb)
#--------------------------------------------------------------------------------------
library(data.table)
# location of the unzipped download
dataFolder <- "D:/buylowsellhigh/downloaded/"
theDataFile <- paste0(dataFolder,"melbdatathon2019_buylowsellhigh.csv")
# load everything into a data.table
dt <- fread(theDataFile)
# how many rows?
nrow(dt)
#4,907,361
# which columns?
names(dt)
# [1] "keys_pair" "minutesSinceStart" "gap" "barClosePrice" "tradePrice" "Lpred1b" "Lpred2b" "Lpred3b" "Lpred4b"
#[10] "Lpred5b" "Lpred6b" "Lpred7b" "Lpred8b" "Lpred9b" "Lpred10b" "Lpred11b" "Lpred12b" "Lpred13b"
#[19] "Lpred14b"
# which pairs?
dt[, unique(keys_pair)]
#[1] "0x_bitcoin" "bitcoin_usdollar" "bitcoincash_usdollar" "cardano_bitcoin" "dash_usdollar" "litecoin_bitcoin" "litecoin_tetherusd" "monero_bitcoin"
#[9] "qtum_bitcoin" "ripple_bitcoin" "ripple_usdollar" "stratis_bitcoin" "tron_tetherusd" "zcash_bitcoin" "pair_1" "pair_2"
#[17] "pair_3" "pair_4" "pair_5" "pair_6" "pair_7" "pair_8" "pair_9" "pair_10"
#[25] "pair_11" "pair_12" "pair_13" "pair_14" "pair_15" "pair_16" "pair_17" "pair_18"
#[33] "pair_19" "pair_20" "pair_21" "pair_22"
# price information is missing for the unnamed pairs;
# take one of them and look at it
x1 <- dt[dt$keys_pair == "pair_1"]
nrow(x1)
#132,721
summary(x1)
# how the first prediction column is distributed
hist(dt$Lpred1b,breaks=100)
# first 1000 rows of one named pair: price and prediction over time
x <- dt[dt$keys_pair == "bitcoin_usdollar"][1:1000]
plot(x$minutesSinceStart,x$barClosePrice,type='l',col="blue")
plot(x$minutesSinceStart,x$Lpred1b,type='l',col="blue")
2019 |