#!/usr/bin/Rscript

# Copyright (c) 2014,
# Mathias Kuhring, KuhringM@rki.de, Robert Koch Institute, Germany, 
# All rights reserved. For details, please note the license.txt.

# surankco-training: training of random forest using contig features (from 
#                    surankco-feature) and contig scores (surankco-score)


# get script path
# Rscript passes the executed file as "--file=<path>" among the full command
# line arguments; the directory of that file is used to locate the helper
# scripts that are sourced further below.
args <- commandArgs(trailingOnly = FALSE)
script.arg <- "--file="
script.pattern <- paste0("^", script.arg)  # only match at the argument start
script.name <- sub(script.pattern, "", grep(script.pattern, args, value = TRUE))
if (length(script.name) > 0) {
  script.path <- dirname(script.name[1])
} else {
  # No "--file=" argument (e.g. interactive session or source()): fall back to
  # the working directory instead of ending up with an empty script path.
  script.path <- getwd()
}


# testing/debugging
# args <- c("--directory=data")
# script.path <- getwd()
DEBUGGING <- FALSE


# sources and libraries
# The helper scripts are expected in the "r" directory next to this script.
source(file.path(script.path, "r", "parameter.R"))
source(file.path(script.path, "r", "import.R"))
source(file.path(script.path, "r", "rf.R"))
source(file.path(script.path, "r", "scores.R"))
source(file.path(script.path, "r", "expofits.R"))
# NOTE(review): loadPackages is not defined in this file; presumably it is
# provided by one of the helper scripts sourced above -- confirm.
loadPackages(c("optparse","MASS","randomForest"), quietly=TRUE)


# Parse the training parameters; the "files" entry of the result is kept
# separately because the import step reads its paths from there.
cat("prepare files\n")
parameters <- parseSurankcoTraining()
files <- parameters$files


# Debug aid: dump the raw command line and everything that was parsed from it.
if (DEBUGGING) {
  invisible(lapply(list(args, parameters, files), print))
}


# import feature and score files
cat("import features and scores\n")
features <- readSurankcoFeatures(files$features.txt)
# NOTE(review): the scores file is read with the feature reader as well --
# confirm that no dedicated score reader is intended here.
scores <- readSurankcoFeatures(files$scores.txt)


# merge and prepare data
cat("prepare and merge data\n")
training <- selectContigs(features, scores)
training <- dataFilter(training)

# Column layout relied on below: the feature columns start at "Length" and end
# right before "NormedMatchCount1"; the score columns run from
# "NormedMatchCount1" to the last column.
firstFeature <- match("Length", colnames(training))
firstScore <- match("NormedMatchCount1", colnames(training))
lastScore <- ncol(training)

# Fail with a clear message instead of a cryptic error from ":" further down.
if (is.na(firstFeature)) {
  stop("column 'Length' not found in the merged training data", call. = FALSE)
}
if (is.na(firstScore)) {
  stop("column 'NormedMatchCount1' not found in the merged training data",
       call. = FALSE)
}
if (firstScore <= firstFeature) {
  # a descending range would silently select the wrong columns
  stop("column 'NormedMatchCount1' must come after column 'Length'",
       call. = FALSE)
}

# drop = FALSE keeps the tabular structure even if only one column is selected
input <- training[, firstFeature:(firstScore - 1), drop = FALSE]
targets <- training[, firstScore:lastScore, drop = FALSE]

cat("divide into classes\n")
if (DEBUGGING) {
  cat("before:\n")
  # explicit print: do not rely on top-level autoprinting of the if-value
  print(summary(targets))
}
# set thresholds/classes (maybe with fitting): manual thresholds are used when
# the parsed parameters contain them, otherwise the classes are derived via
# expoClasses using the exponential quantile parameter
if ("manual.thresholds" %in% names(parameters)) {
  targets <- manualClasses(targets, parameters$manual.thresholds)
} else {
  targets <- expoClasses(targets, parameters$exponential.quantile)
}

# debug output of the resulting classification
if (DEBUGGING) { # show always?
  cat("after:\n")
  # summary() is not the last expression of this block, so without print()
  # its result would be discarded silently
  print(summary(targets))

  write.table(cbind(training$ContigID,targets), file="classification.tmp",
              sep="\t", dec = ".", col.names=TRUE, row.names=FALSE)
}
# Train the random forests from the feature columns and the class targets.
cat("train random forests\n")
rfs <- rfTraining(input, targets)

# Store the trained forests in the output file given by the parameters.
cat("export random forests\n")
save(list = "rfs", file = parameters$output.filename)

# Final status message.
cat("surankco-training calculations done\n")
