require(ggplot2)
require(plotly)
require(data.table)
require(stringr)

# ---- setup ----
# number of initial warm-up runs within one experiment (out of 10) that are skipped
skipRuns <- 3
# upper bound for the number of full repetitions (entire experiments)
fullRepetitions <- 5
# numerical data columns for which statistics are calculated
numCols <- c(
  "RESPONSETIME_AUTO",
  "REASONER_EVALUATION_TIME",
  "REASONER_INSTANCE_CREATION_TIME",
  "REASONER_REASONING_TIME",
  "REASONER_TRANSLATION_TIME"
)
# names of the files produced during measurement
filenames <- c(
  "measurements-sse.tsv",
  "measurements-rtvil.tsv",
  "measurements-scenarios.tsv",
  "measurements-scenarioVariants.tsv",
  "measurements-scaleLog.tsv"
)
# tags whose rows are removed before statistics are calculated (see my.readFile)
skipTags <- "CompoundInit"
#folder <- 'benchmark-results-old-1.3.0-20180826/auto/1'
#folder <- 'benchmark-results-old-1.1.0-20180818/auto/1'

#http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/

# NA-tolerant length function: if na.rm is TRUE only the non-NA elements of x are counted,
# otherwise the plain length of x is returned
my.lengthNA <- function (x, na.rm=FALSE) {
  if (na.rm) {
    return(sum(!is.na(x)))
  }
  length(x)
}

# statistical summary over x, returns a list with mean, median, standard deviation (sd),
# standard error (se) and the half-width of the confidence interval (ci)
#   x             numeric vector of observations
#   conf.interval confidence level for ci (default .95)
#   na.rm         if TRUE (default) NA values are dropped first, so that the sample size and
#                 mean/median/sd are based on the same observations; with FALSE any NA in x
#                 turns all statistics into NA
my.summary = function(x, conf.interval=.95, na.rm=TRUE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  n <- length(x)
  res <- list(mean = mean(x), median = median(x), sd = sd(x), se=0, ci=0)
  res$se <- res$sd / sqrt(n)

  # Confidence interval multiplier for standard error
  # Calculate t-statistic for confidence interval:
  # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
  # for n <= 1 the multiplier is irrelevant as sd (and thus se) is NA anyway
  ciMult <- if (n > 1) qt(conf.interval/2 + .5, n-1) else 0
  res$ci <- res$se * ciMult
  # see also https://www.cyclismo.org/tutorial/R/confidence.html (Section 9.2)
  return(res)
}

# shortens URIs to the path segments including and after testdata (also including the one path
# segment before testdata); URIs that do not contain such a part are returned unchanged
# vectorized over uri, i.e., accepts a single URI as well as a character vector of URIs
my.shortenURI = function(uri) {
  # the greedy leading ".*/" consumes everything in front of the segment preceding the (last)
  # "/testdata/"; the whole match is replaced by the captured remainder, while elements that do
  # not match (and NA) are passed through by sub as they are
  return (sub(".*/([^/]*/testdata/.*)", "\\1", uri, perl=TRUE))
}

# reads a file produced by the MeasurementCollector of EASy-Producer and summarizes it
#   folder    folder to read from
#   filename  name of the file to read
#   run       actual run count for a complete experiment repetition; if > 0 the file is read
#             from <folder>/auto/<run>, otherwise directly from <folder>
#   skip      warm-up runs to be skipped, only rows with runCount > skip are kept
#             (default global skipRuns)
#   statCols  statistical columns (default global numCols)
#   skipT     tags to be skipped (default global skipTags)
# shortens URIs using my.shortenURI
# factors model.name, URI, tag, caller
# applies my.summary for statistical summaries on statCols
# projects to URI, tag, model.name, MODEL_COMPLEXITY and the statistics of statCols, which are
# named <column>.mean, <column>.median, <column>.sd, <column>.se and <column>.ci
my.readFile = function(folder, filename, run=0, skip=skipRuns, statCols=numCols, skipT=skipTags) {
  tmp.folder <- if (run > 0) file.path(folder, 'auto', run) else folder
  #read the data, use header as column names, turn them into real identifiers (check.names), avoid reading strings as factors
  tmp.resTable <- as.data.table(read.table(file = file.path(tmp.folder, filename), sep = '\t', header = TRUE, dec=",", check.names=TRUE, stringsAsFactors=FALSE))
  #preprocess, only runs later than given run count shall be in the result
  tmp.resTable <- tmp.resTable[runCount > skip]
  #preprocess, shorten URIs for interactive display
  #URI must still be a plain string while shortening, the factors are needed as index for the later operations
  #(:= modifies the table by reference, so no re-assignment is needed)
  tmp.resTable[, `:=`(
    URI = factor(unlist(lapply(URI, my.shortenURI))),
    model.name = factor(model.name),
    tag = factor(tag),
    caller = factor(caller)
  )]
  if (length(skipT) > 0) {
    #use the skipT parameter (not the global skipTags) so that callers can override the tags to be skipped
    tmp.resTable <- tmp.resTable[!(tag %in% skipT)]
  }
  #group, turn statCols into the table view .SD and apply the summary function per column;
  #unlist/as.list unpacks the summaries so that the statistic names end up in the column names
  tmp.resTable <- tmp.resTable[, as.list(unlist(lapply(.SD, my.summary))), by = list(URI, tag, model.name, MODEL_COMPLEXITY), .SDcols = statCols]
  return (tmp.resTable)
}

# reads all measurement files (global filenames) of the experiment repetitions below folder and
# aggregates them into one table
#   folder       base folder, the individual repetitions are read from <folder>/auto/<run>
#                (see my.readFile)
#   repetitions  number of repetitions to read; if not given explicitly, the global
#                fullRepetitions is used
# per file, the per-repetition summaries of my.readFile are stacked and the per-repetition
# means of numCols are summarized again with my.summary per URI, tag, model.name and
# MODEL_COMPLEXITY; the result columns are named as in my.readFile (<column>.mean,
# <column>.median, <column>.sd, <column>.se, <column>.ci)
my.readData = function(folder, repetitions=1) {
  # without an explicit argument all fullRepetitions are read (as before), an explicitly
  # given number of repetitions is honored
  tmp.runs <- seq_len(if (missing(repetitions)) fullRepetitions else repetitions)
  # my.readFile already returns summarized columns named <column>.<statistic>, i.e., the raw
  # numCols do not exist anymore at this point; the aggregation is done over the means
  tmp.meanCols <- paste0(numCols, ".mean")
  tmp.groupedTables <- lapply(filenames, function(filename) {
    #read the data of all repetitions and stack the tables for one-step analysis
    tmp.table <- rbindlist(lapply(tmp.runs, function(run) my.readFile(folder, filename, run)))
    #group stacked table by URI, tag and model.name, turn the mean columns into table view .SD and apply summary function
    #unpack result of summary so that names are taken over in column names
    tmp.table[, {
      tmp.stats <- lapply(.SD, my.summary)
      # name by the original columns so that the result is <column>.mean rather than <column>.mean.mean
      names(tmp.stats) <- numCols
      as.list(unlist(tmp.stats))
    }, by = list(URI, tag, model.name, MODEL_COMPLEXITY), .SDcols = tmp.meanCols]
  })
  #stack all grouped tables
  data <- rbindlist(tmp.groupedTables)
  return (data)
}

# resolves col against data: a character col is taken as a name and looked up in data via [[ ]],
# anything else (e.g., something like data$col) is handed back as it is
my.get = function(data, col) {
  if (!is.character(col)) {
    return(col)
  }
  data[[col]]
}

# draws a diagram (points with error bars) for data and returns the ggplot object
#   data             table containing the values to plot
#   yCol             y-axis values
#   yColName         y-axis title
#   yBarCol          error bar extent, bars range from yCol - yBarCol to yCol + yBarCol
#   diagramTitle     title of the diagram (default "")
#   xCol             x-axis values (default "MODEL_COMPLEXITY")
#   xColName         x-axis title (default "model complexity")
#   labelCol         point labels (default "URI"), for interactive use call
#                    ggplotly(tooltip=c("label")) after plot(gg)
#   colorCol         values determining the point/bar color (default "tag")
#   colorLegendName  title of the color legend (default "")
# all *Col parameters are resolved through my.get, i.e., they can be given as column names
# (strings) or directly as values like data$...
my.createDiagram = function(data, yCol, yColName, yBarCol, diagramTitle="", xCol="MODEL_COMPLEXITY", xColName="model complexity", labelCol="URI", colorCol="tag", colorLegendName="") {
  # unclear, width seems to be correlated to the size, stackoverflow.com/questions/19420903/width-of-error-bars-in-ggplot2
  # NOTE(review): for a data.frame/data.table length(data) is the number of columns, not of rows - confirm that this is the intended scale
  bar.width <- length(data) * 2
  # label and color go through labelCol/colorCol (instead of fixed URI/tag columns) so that the parameters take effect
  gg <- ggplot(data, aes(x=my.get(data, xCol), y=my.get(data, yCol), label=my.get(data, labelCol), color=my.get(data, colorCol))) +
    geom_errorbar(aes(ymin=my.get(data, yCol)-my.get(data, yBarCol), ymax=my.get(data, yCol)+my.get(data, yBarCol)), width=bar.width) +
    geom_point() +
    labs(x=xColName, y=yColName, title = diagramTitle, color=colorLegendName)
  return(gg)
}

# read and summarize the SSE measurements, then set up the reasoning time diagram showing the
# mean with the confidence interval as error bars (columns are passed by name and resolved by my.get)
data <- my.readFile(
  folder = 'W:/offlineFiles/EASy-concepts/reasoner/measures/new',
  filename = 'measurements-sse.tsv'
)
gg <- my.createDiagram(
  data,
  yCol = "REASONER_REASONING_TIME.mean",
  yColName = "mean reasoning time",
  yBarCol = "REASONER_REASONING_TIME.ci",
  diagramTitle = "Reasoning time"
)


#library(data.table)
#t[, print(.SD), list(URI)]
#t1 <- t[,sapply(.SD,my.summary), list(URI, tag), .SDcols=c("RESPONSETIME_AUTO")]
#t1 <- t[,sapply(.SD,my.summary), list(URI, tag), .SDcols=c("RESPONSETIME_AUTO", "REASONER_EVALUATION_TIME", "REASONER_INSTANCE_CREATION_TIME", "REASONER_REASONING_TIME", "REASONER_TRANSLATION_TIME")]

#data <- my.readData('W:/offlineFiles/EASy-concepts/reasoner/measures/benchmark-results-new-1.3.0-20180826')


#sse <- read.table(file = paste(folder, 'measurements-sse.tsv', sep='/'), sep = '\t', header = TRUE, dec=",")
#rtvil <- read.table(file = paste(folder, 'measurements-rtvil.tsv', sep='/'), sep = '\t', header = TRUE, dec=",")
#scenarios <- read.table(file = paste(folder, 'measurements-scenarios.tsv', sep='/'), sep = '\t', header = TRUE, dec=",")
#scenarioVars <- read.table(file = paste(folder, 'measurements-scenarioVariants.tsv', sep='/'), sep = '\t', header = TRUE, dec=",")
#scaleLog <- read.table(file = paste(folder, 'measurements-scaleLog.tsv', sep='/'), sep = '\t', header = TRUE, dec=",")

#data <- rbind(sse, rtvil, scenarios, scenarioVars, scaleLog)
#reasoning <- data[data$tag == "REASONING", ]

#gg<-ggplot(data, aes(x=MODEL_CONSTRAINT_COMPLEXITY,y=REASONER_REASONING_TIME,label=model.name,color=tag))+geom_point()
#gg<-ggplot(data, aes(x=MODEL_CONSTRAINT_COMPLEXITY+MODEL_VARIABLES,y=REASONER_REASONING_TIME,label=caller,color=tag))+geom_point()
#gg<-ggplot(data, aes(x=MODEL_COMPLEXITY,y=REASONER_REASONING_TIME,label=model.name,color=tag))+geom_point()
#plot(gg)

#gg<-ggplot(data, aes(x=MODEL_COMPLEXITY,y=REASONER_REASONING_TIME.mean,label=URI,color=tag)) + 
#  geom_errorbar(aes(ymin=REASONER_REASONING_TIME.mean-REASONER_REASONING_TIME.ci, ymax=REASONER_REASONING_TIME.mean+REASONER_REASONING_TIME.ci), size=1, width=50) + 
#  geom_point()
#ggplotly(tooltip=c("label"))