Another Data Mining Blog: Buy Low Sell High

2019
<<previous next>>

In the previous post there was code to generate a submission file.

Submission files need to be valid so that we can process them; otherwise they will be rejected by the calculation engine. Below is some code that your file should pass to ensure it will not be rejected.

#----------------------------------------------------------
#
# some R code to test your submission file is valid
#
#----------------------------------------------------------

library(data.table)

# Where the submission lives: the download folder, then the file to validate.
dataFolder <- "D:/buylowsellhigh/downloaded/"
fileToTest <- "test_submission_file_1.csv"

# dataFolder already ends with "/", so plain concatenation gives the full path.
theDataFile <- paste0(dataFolder, fileToTest)

# Load the trades; every later check works on this table.
tradeData <- fread(theDataFile)

# Outcome of the basic checks: stays 'OK' unless one of the checks below fails.
ermsg <- 'OK'

# Columns every submission file must contain (extra columns are tolerated).
requiredCols <- c('strategyName','keys_pair','enterTime','exitTime','direction','percentPair')

#------------------------
# do some basic checks
#------------------------
# The checks run in order and only the FIRST failure is reported. Later checks
# rely on earlier ones having passed (e.g. min() needs at least one row and
# no missing values).
missing_cols <- setdiff(requiredCols, colnames(tradeData))

if (length(missing_cols) > 0) {
  ermsg <- paste('\n incorrect column names in the trades file. We are mising:\n',paste(missing_cols,collapse = ","))
} else if (is.character(tradeData$enterTime)) {
  # is.character() rather than class(x) == "character": class() can return
  # several names (e.g. a date-time column), and a condition of length > 1 is
  # an error inside if() from R 4.2.0 onwards.
  ermsg <- '\n the field enterTime contains non numeric values'
} else if (is.character(tradeData$exitTime)) {
  ermsg <- '\n the field exitTime contains non numeric values'
} else if (is.character(tradeData$direction)) {
  ermsg <- '\n the field direction contains non numeric values'
} else if (is.character(tradeData$percentPair)) {
  ermsg <- '\n the field percentPair contains non numeric values'
} else if (nrow(tradeData) == 0) {
  ermsg <- '\n the file contains no records'
} else if (anyNA(tradeData)) {
  ermsg <- '\n the file contains missing values'
} else if (min(tradeData$exitTime - tradeData$enterTime) <= 0) {
  # every trade must have a strictly positive duration
  ermsg <- '\n the exit time needs to be after the entry time'
}

# Abort the validation as soon as a basic check has failed.
if (ermsg != 'OK') {
  stop(ermsg)
}


  #-------------------------------------
  # check that we are not over 100% in
  # a trade at any particular time
  #-------------------------------------
  # Each trade is expanded onto a grid of timeStep-spaced instants covering
  # enterTime, enterTime + timeStep, ... up to at most exitTime - timeStep
  # (the exit instant itself is not counted). For every combination of pair,
  # strategy and direction, the percentPair values of all trades open at the
  # same instant are summed; the file is rejected if any sum exceeds 1.
  timeStep <- 5

  # A trade shorter than one time step has no grid instant at all and would
  # make seq() below fail with an unhelpful "wrong sign in 'by'" error, so
  # reject it up front with an explicit message.
  if (any(tradeData$exitTime - tradeData$enterTime < timeStep)) {
    stop(paste("\n every trade must last at least one time step of", timeStep))
  }

  # xKey identifies one pair / strategy / direction combination.
  # Note: := adds the column to tradeData by reference.
  tradeData[, xKey := paste(keys_pair, strategyName, direction, sep = "_")]
  allKeys <- unique(tradeData$xKey)

  for (thisKey in allKeys) {

    pairStrategyTradeData <- tradeData[xKey == thisKey]
    E_numberOfTrades <- nrow(pairStrategyTradeData)

    # progress output: key and number of trades under it
    cat("\n", thisKey, E_numberOfTrades)

    # get the times we are in a trade: one vector of instants per trade
    tradeTimes <- lapply(seq_len(E_numberOfTrades), function(tradeNumber) {
      seq(
        from = pairStrategyTradeData$enterTime[tradeNumber],
        to = pairStrategyTradeData$exitTime[tradeNumber] - timeStep,
        by = timeStep
      )
    })
    allTimes1 <- unlist(tradeTimes)
    # repeat each trade's percentage once per instant the trade is open
    allPercent1 <- rep(pairStrategyTradeData$percentPair, times = lengths(tradeTimes))

    # total exposure per instant (one row per distinct time, sorted by time)
    d <- data.frame(allTimes1, allPercent1)
    percentByTime <- aggregate(allPercent1 ~ allTimes1, data = d, FUN = sum)
    posPercents <- percentByTime$allPercent1

    # make sure the max percent at any one time is not > 1
    maxPercent <- max(posPercents)
    if (maxPercent > 1) {
      # Earliest offending instant. It must be looked up in the aggregated
      # table: its row positions do not correspond to rows of d.
      aBadTime <- percentByTime$allTimes1[which(posPercents > 1)[1]]
      ermsg <- paste("\n you cannot be more than 100% in a trade at any one time:\n",thisKey,"\ntime=",aBadTime,"\npercent =",maxPercent)
      stop(ermsg)
    }

  } #  thisKey

  cat('\nthe file appears to be OK!')

Update 17th Oct 2019.
The file validator is now stricter. The basic checks section in the above code has now been expanded to the following set of rules:




# Stricter replacement for the basic checks section of the validator.
# It expects tradeData to be loaded and ermsg to have been initialised to 'OK'
# beforehand; it only assigns ermsg, so the caller still has to stop() when
# ermsg != 'OK'. Checks run in order and only the FIRST failure is reported.

# Columns every submission file must contain (extra columns are tolerated).
requiredCols <- c('strategyName','keys_pair','enterTime','exitTime','direction','percentPair')

# Every one of these pair names must appear in keys_pair.
requiredPairs <- paste0('pair_',1:22)

#do some checks
missing_cols <- setdiff(requiredCols, colnames(tradeData))

if (length(missing_cols) > 0) {
  ermsg <- paste('\n incorrect column names in the trades file. We are mising:\n',paste(missing_cols,collapse = ","))
} else if (is.character(tradeData$enterTime)) {
  # is.character() rather than class(x) == "character": class() can return
  # several names (e.g. a date-time column), and a condition of length > 1 is
  # an error inside if() from R 4.2.0 onwards.
  ermsg <- '\n the field enterTime contains non numeric values'
} else if (is.character(tradeData$exitTime)) {
  ermsg <- '\n the field exitTime contains non numeric values'
} else if (is.character(tradeData$direction)) {
  ermsg <- '\n the field direction contains non numeric values'
} else if (is.character(tradeData$percentPair)) {
  ermsg <- '\n the field percentPair contains non numeric values'
} else if (nrow(tradeData) == 0) {
  ermsg <- '\n the file contains no records'
} else if (anyNA(tradeData)) {
  ermsg <- '\n the file contains missing values'
} else if (min(tradeData$exitTime - tradeData$enterTime) <= 0) {
  ermsg <- '\n the exit time needs to be after the entry time'
} else if (nrow(tradeData) < 50) {
  ermsg <- '\n not enough trades'
} else if (max(tradeData$direction) > 1) {
  ermsg <- '\n trade direction > 1'
} else if (min(tradeData$direction) < 0) {
  ermsg <- '\n trade direction < 0'
} else if (length(unique(tradeData$direction)) > 2) {
  # NOTE(review): a fractional direction such as 0.5 passes the three
  # direction checks as long as at most two distinct values occur - confirm
  # whether only 0 and 1 are meant to be accepted.
  ermsg <- '\n too many trade directions'
} else if (any(c(tradeData$enterTime, tradeData$exitTime) %% 5 != 0)) {
  # entry and exit times must both be multiples of 5
  ermsg <- '\n wrong times'
} else if (min(tradeData$percentPair) < 0) {
  ermsg <- '\n negative percent pairs'
} else if (length(setdiff(requiredPairs, tradeData$keys_pair)) > 0) {
  # at least one required pair is absent (this also covers "none present")
  ermsg <- '\n you need to predict for all pairs'
} else if (min(tradeData[, .N, by = c('keys_pair','direction')]$N) < 5) {
  # Counts trades per (keys_pair, direction) combination that occurs in the
  # file; a combination with no rows at all is not part of this count.
  ermsg <- '\n you need at least 5 trades per pair per direction'
}

4 comments:

tejaswini14 July 2020 at 02:07
Truly, this article is really one of the very best in the history of articles. I am a antique ’Article’ collector and I sometimes read some new articles if I find them interesting. And I found this one pretty fascinating and it should go into my collection. Very good work!data science certification malaysia
Outsource BigData25 February 2021 at 03:51
Informative blog, We provide end-to-end web data mining services in marketing, healthcare, finance, management & consulting, manufacturing, ITES and market research sectors. Companies globally leverage our web data mining services to have the data they need to improve the bottom line and business process.
Unknown25 October 2021 at 01:48
Thanks for the information about Blogspot very informative for everyone
data science course in chennai
amazingwebdeveloper23 February 2022 at 02:23
It's really true. Hiring a freelancer, especially for the flutter programming language, is difficult. There are so many platforms that promise to get professionals but are unreliable. Can you name some secure, trustworthy, & reliable to assist me in hiring a professional Flutter developer for my projects?

Another Data Mining Blog

Thursday, 5 September 2019

Buy Low Sell High - file checking

4 comments: