final.R

---
title: "soccerproject"
author: "Neeraj"

---

-------------------------------------------------------------------------
-------------------------------------------------------------------------


```{r}
# install.packages("elastic")
library("elastic")

# Connect to the Elasticsearch node on port 9200 and report how many
# documents the 'spainsoccer' index holds.
connect(es_port = 9200)
count(index = "spainsoccer")

# Peek at a single hit (id 10, type liga_data) to see the raw document layout.
Search(index = "spainsoccer", size = 1, id = 10)$hits$hits

# Pull up to 3920 documents as a data frame (asdf = TRUE). The per-match
# fields sit in the `_source` column of the hits.
laliga <- Search(index = "spainsoccer", size = 3920, asdf = TRUE)
laligadf <- laliga$hits$hits[["_source"]]
ncol(laligadf)
nrow(laligadf)
colnames(laligadf)

# For the purpose of this project most of the betting variables are excluded,
# as they are not important right now. Only the Bet365 odds are kept.
liga <- laligadf[, c(
  "Div", "Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "FTR",
  "HTHG", "HTAG", "HTR", "HS", "AS", "HST", "AST",
  "HC", "AC", "HF", "AF", "HY", "AY", "HR", "AR",
  "B365A", "B365H", "B365D"
)]

# The variables removed above were dropped manually, based on their
# theoretical importance to this project. The data is cleaned further below,
# and variables are added or removed depending on how influential they are.


```


Variables | Description
----------| -----------
`Div` | `League Division`
`Date` | `Match Date (dd/mm/yy)`
`HomeTeam` | `Home Team`
`AwayTeam` | `Away Team`
`FTHG` | `Full Time Home Team Goals`
`FTAG` | `Full Time Away Team Goals`
`FTR` | `Full Time Result (H=Home Win, D=Draw, A=Away Win)`
`HTHG` | `Half Time Home Team Goals`
`HTAG` | `Half Time Away Team Goals`
`HTR` | `Half Time Result (H=Home Win, D=Draw, A=Away Win)`
`HS` | `Home Team Shots`
`AS` | `Away Team Shots`
`HST` | `Home Team Shots on Target`
`AST` | `Away Team Shots on Target`
`HC` | `Home Team Corners`
`AC` | `Away Team Corners`
`HF` | `Home Team Fouls Committed`
`AF` | `Away Team Fouls Committed`
`HY` | `Home Team Yellow Cards`
`AY` | `Away Team Yellow Cards`
`HR` | `Home Team Red Cards`
`AR` | `Away Team Red Cards`
`B365H` | `Bet365 home win odds`
`B365D` | `Bet365 draw odds`
`B365A` | `Bet365 away win odds`


## Load the libraries.


```{r, echo=TRUE,eval=TRUE, warning=FALSE, message=FALSE}
library(reshape2)
library(lubridate)
library(RJSONIO)
library(plyr)
library(dplyr)
library(data.table)
require(corrplot, quietly=TRUE)
require(fBasics, quietly=TRUE)
library(ROCR)
require(ggplot2, quietly=TRUE)
library(ltm)
library(gclus)
library(odds.converter)
library(caret)
library(rpart)
library(e1071)
library(randomForest)
library(gbm)
library(nnet)
library(MASS)
require(car, quietly=TRUE)
library(gbm)
library(kernlab)
```


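We first inspect the structure and summary of the data and check for missing values. (The conversion of the `Date` variable from character to date format is done later, just before the train/test split.)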


```{r}
# Structure and summary statistics of the selected columns.
str(liga)
summary(liga)
# liga$Date <- ymd(liga$Date) # POSIXct format object, which works with both factors or characters

# First and foremost, count the missing values in every column.
# vapply() pins the result to one integer per column, unlike sapply().
vapply(liga, function(x) sum(is.na(x)), integer(1))

```


The summary shows we do not have ANY missing values, which is a good sign.


```{r}
# Correlation analysis of the numeric match statistics.

# Subset the numeric variables by position: full- and half-time goals
# (columns 5, 6, 8, 9) plus columns 11 to 25 of `liga`.
liga_num <- liga[, c(5, 6, 8, 9, 11:25)]
liga.cor <- cor(liga_num, use = "pairwise.complete.obs", method = "pearson")

# Panel colours for the scatter plot matrix at the end of this chunk.
# They are computed from the correlation matrix in the ORIGINAL variable
# order, because cpairs() applies `ord` to both the data and the colour
# matrix itself. Colours taken from an already reordered matrix would be
# permuted twice and no longer match their panels.
dta.col <- dmat.color(liga.cor)

# Order the correlations by their strength with the first variable.
ord <- order(liga.cor[1, ])
liga.cor <- liga.cor[ord, ord]
corrplot(liga.cor, mar = c(0, 0, 1, 0))
liga.cor
rcor.test(liga_num, method = "pearson")

# Heat map of the correlation matrix. reshape2::melt is called explicitly
# because data.table (attached later) masks melt().
title <- "correlation heat map"
corp <- ggplot(
  reshape2::melt(cor(liga_num, use = "pairwise.complete.obs")),
  aes(x = Var1, y = Var2, fill = value)
) +
  geom_tile() +
  scale_fill_gradient2(limits = c(-1, 1))
corp <- corp + theme(
  axis.title.x = element_blank(),
  axis.text.x = element_blank(),
  axis.ticks = element_blank()
)
corp <- corp + ggtitle(title)
corp

# Scatter plot matrix: variables are reordered by `ord` and each panel is
# coloured by the correlation of its variable pair.
cpairs(liga_num, ord,
  panel.colors = dta.col, gap = 0.5,
  main = "Variables Ordered and Colored by Correlation"
)
```

Some of the variables are slightly correlated; whether a particular variable should be dropped depends on the correlation threshold we choose. For now, we retain all the variables.


## further exploring the data

```{r, message=FALSE, warning=FALSE}

# Distribution of full-time results. FTR is a discrete label (H / D / A), so
# the counts are drawn with geom_bar(); geom_histogram() bins a continuous x
# and does not apply to a categorical variable.
ggplot(liga, aes(x = FTR)) + geom_bar()

# The betting variables are not linearly correlated, contrary to what was
# assumed from the correlation plot above.
ggplot(liga, aes(log(B365A), log(B365D), color = FTR)) + geom_point() + geom_smooth()
ggplot(liga, aes(log(B365A), log(B365H), color = FTR)) + geom_point() + geom_smooth()
```


- The general understanding from the first plot is that `Home` advantage is significant and plays a large role in determining the winner in most cases, as the full-time results contain more home wins than away wins.


## Creating derived new metrics, by executing operations on various columns, and cleaning data (removing redundant variables).

```{r}
# The prediction target is the result of each match. A numeric `winner`
# encoding was considered and is kept here for reference only:
#
# liga$winner[liga$FTR == "H"] <- 1 # when home team wins
# liga$winner[liga$FTR == "A"] <- 0 # when away team wins
# liga$winner[liga$FTR == "D"] <- 2 # when neither team wins, game ends in draw.

# Drop the first column (Div): it is irrelevant, since all teams are known
# to be from Div1.
liga <- liga[-1]

# Use the odds.converter package to turn the Bet365 decimal odds for each
# match into the bookmaker's probabilities of a draw, an away win and a
# home win.
liga$Drawodd <- odds.dec2prob(liga$B365D)
liga$AwayWinodd <- odds.dec2prob(liga$B365A)
liga$HomeWinodd <- odds.dec2prob(liga$B365H)
liga <- liga[, c(setdiff(names(liga), c("HomeWinodd", "AwayWinodd", "Drawodd")),
                 "HomeWinodd", "AwayWinodd", "Drawodd")]

# The betting data can be used in two ways: keep it as a model input, so the
# winner is estimated once the experts' odds are known, or leave it out of
# the model and use it at the end to compare against our own estimates for
# win, draw or loss.
```


```{r}

# Home vs. away red cards, coloured by full-time result. `colour` has to be
# mapped inside aes(); passed as a plain argument to ggplot() it is silently
# ignored and the points are drawn uncoloured.
p <- ggplot(liga, aes(HR, AR, colour = factor(FTR))) + geom_point()
# One panel per full-time result.
p + facet_grid(. ~ FTR)
```


- We will further explore this part through visualization. We see that the away team wins when the home team commits more fouls that turn into red cards. The insights are a little unexpected as well. First, we see that in away-team wins the maximum number of red cards for the away team was 2, whereas the home team accounted for 3 in a losing cause. Secondly, even when the away team gets 2 red cards and the home team gets 0, the away team still wins. This could be credited to two reasons: one, the away team was stronger than the home team, so the red cards played little or no part against a weaker opponent; or two, the red cards came into play very late in the game, that is, towards the end of the game.

- For home-team wins, the maximum number of cards the away team received was 4, and the home team 2. However, it is interesting to note that when the away team got 4 cards in a game (meaning 4 of its 11 players were sent off), the home team got none, so the home team had more players on the field, covered more ground and was able to dominate.

- For games that ended in a draw, however, the away team got at most 3 cards, while the home team got at most 2. Again, this could be because away teams, after having players sent off, adopted a defensive mindset and considered a draw a favorable result. Away teams are involved in more red cards in draws than home teams, which also makes sense, as the stadium and fans are behind the home team, not the away team. For an away team, winning in such conditions with players sent off is always tough.


## Create training and test datasets

```{r}
# Final selection of the variables, train/test split and normalisation.

# Re-express the match date (dd/mm/yy) as an ISO "YYYY-MM-DD" string, so that
# plain string comparison orders the dates chronologically.
d4 <- as.Date(liga$Date, "%d/%m/%y")
d4 <- strftime(d4, "%Y-%m-%d")
liga$Date <- d4

# Drop everything that must not be a predictor: the raw Bet365 odds, the
# probabilities derived from them, and the full-time goals.
drop_cols <- c(
  "B365H", "B365A", "B365D",
  "HomeWinodd", "AwayWinodd", "Drawodd",
  "FTHG", "FTAG"
)
liga <- liga[, setdiff(names(liga), drop_cols)]

# We want to predict results for the current season, which starts in August,
# so everything after the cutoff is the test set. The training set takes all
# remaining rows (`<=`), so a match dated exactly on the cutoff is not
# silently dropped from both sets.
season_cutoff <- "2015-08-01"
liga_test <- subset(liga, Date > season_cutoff)
liga_train <- subset(liga, Date <= season_cutoff)

# Date, HomeTeam and AwayTeam (the first three columns) only identify the
# match; they are not used as predictors.
id_cols <- c("Date", "HomeTeam", "AwayTeam")
liga_train <- liga_train[, setdiff(names(liga_train), id_cols)]
liga_test <- liga_test[, setdiff(names(liga_test), id_cols)]

## =============================================================================
## Normalise Data
## =============================================================================

## Pre-process predictors (center, scale, Box-Cox). The transformation is
## estimated on the training rows only and then applied to both sets, so no
## information from the test season leaks into the training data.
pp <- preProcess(liga_train, method = c("center", "scale", "BoxCox"))
liga_train <- predict(pp, liga_train)
liga_test <- predict(pp, liga_test)

```


## Pre-model building.

```{r}
library(colorspace)
require(Hmisc, quietly=TRUE)

# Split the training columns by role. Columns 1 (FTR) and 4 (HTR) are the
# non-numeric ones; everything else feeds the PCA.
target <- liga_train["FTR"]
categoric <- liga_train["HTR"]
numeric <- liga_train[-c(1, 4)]

# Principal Components Analysis on the numeric variables only, centred and
# scaled, keeping every component (tol = 0).
pc <- prcomp(na.omit(numeric), center = TRUE, scale. = TRUE, tol = 0)

# Loadings, followed by the importance of each component.
pc
summary(pc)

# Author's reading of the summary: the first 11 principal components explain
# about 94% of the variability in the dataset.

# Plot of the relative importance of the components, with the component
# names added along the x axis.
plot(pc, main = "")
title(main = "Principal Components Importance")
axis(
  1,
  at = seq(0.7, ncol(pc$rotation) * 1.2, 1.2),
  labels = colnames(pc$rotation),
  lty = 0
)

# Biplot of the first two principal components.
biplot(pc, main = "")
title(main = "Principal Components")

```


## Building the model.


```{r, warning=FALSE, message=FALSE}
# Warnings for this chunk are already hidden by the chunk options
# (warning=FALSE). The former `options(warn=-1)` was removed: it was never
# restored, so it silenced warnings for the rest of the session.

# Build the Decision Tree model (the seed is reset so results repeat).
start.time <- Sys.time()
set.seed(12)
liga_dt <- rpart(
  FTR ~ .,
  data = liga_train,
  method = "class",
  parms = list(split = "information"),
  control = rpart.control(usesurrogate = 0, maxsurrogate = 0)
)
end.time <- Sys.time()
time.taken1 <- end.time - start.time

# Generate a textual view of the Decision Tree model.
print(liga_dt)
printcp(liga_dt)

# Plot the resulting Decision Tree.
# fancyRpartPlot() comes from rattle and relies on the rpart.plot package.
library(rattle)
library(rpart.plot)
fancyRpartPlot(liga_dt, main = "Decision Tree- FTR")

# List the rules from the tree using a Rattle support function.
asRules(liga_dt)
  

# Random Forest (randomForest package).

# Convert the response and the half-time result to factors in both the
# training and the test set before fitting.
liga_train$HTR <- as.factor(liga_train$HTR)
liga_train$FTR <- as.factor(liga_train$FTR)
liga_test$HTR <- as.factor(liga_test$HTR)
liga_test$FTR <- as.factor(liga_test$FTR)

# Build the Random Forest model: 500 trees, 3 candidate variables per split,
# sampling without replacement.
start.time <- Sys.time()
set.seed(1234)
liga_rf <- randomForest(
  formula = (FTR) ~ ., data = liga_train, ntree = 500, mtry = 3,
  importance = TRUE, na.action = na.roughfix, replace = FALSE
)
end.time <- Sys.time()
time.taken2 <- end.time - start.time

# Generate textual output of 'Random Forest' model.
liga_rf

# List the importance of the variables, sorted by the third importance column.
rn <- round(importance(liga_rf), 2)
rn[order(rn[, 3], decreasing = TRUE), ]

# Plot the relative importance of the variables.
varImpPlot(liga_rf, main = "")
title(main = "Variable Importance Random Forest")

# Plot the error rate against the number of trees. The plot draws one curve
# per column of err.rate (OOB plus one per class), so the legend takes its
# labels from those columns and uses one colour / line type per curve.
plot(liga_rf, main = "")
err_curves <- colnames(liga_rf$err.rate)
legend("topright", err_curves,
  text.col = seq_along(err_curves),
  lty = seq_along(err_curves),
  col = seq_along(err_curves)
)
title(main = "Error Rates Random Forest")

# Display tree number 1.
# printRandomForests(liga_rf, 1)

# Plot the OOB ROC curve.
# roc.area() / roc.plot() expect a binary 0/1 observation. The score is the
# OOB vote share of the class in the second votes column, so the observation
# is coded one-vs-rest for that same class (previously a 0/1/2 coding of all
# three results was passed).
require(verification)
roc_class <- colnames(liga_rf$votes)[2]
roc_obs <- as.integer(liga_train$FTR == roc_class)
aucc <- roc.area(roc_obs, liga_rf$votes[, 2])$A
roc.plot(roc_obs, liga_rf$votes[, 2], main = "")
legend("bottomright",
  bty = "n",
  sprintf("Area Under the Curve (AUC) = %1.3f", aucc)
)
title(main = sprintf("OOB ROC Curve Random Forest (%s vs rest)", roc_class))


# Support Vector Machine: kernlab's ksvm() with an RBF kernel, fitted with a
# probability model (prob.model = TRUE).
start.time <- Sys.time()
set.seed(567890)
liga_ksvm <- ksvm(
  FTR ~ .,
  data = liga_train,
  kernel = "rbfdot",
  prob.model = TRUE
)
end.time <- Sys.time()
time.taken3 <- end.time - start.time

# Textual view of the fitted SVM.
liga_ksvm

# Timing recorded by the author: 1.22 secs

# Generalized Boosted Regression Model (gbm), multinomial response:
# 100 trees, shrinkage 0.05, interaction depth 3, 3-fold cross-validation.
liga_gbm <- gbm(
  FTR ~ .,
  data = liga_train,
  distribution = "multinomial",
  n.trees = 100,
  shrinkage = 0.05,
  interaction.depth = 3,
  cv.folds = 3,
  verbose = FALSE,
  n.cores = 1
)
liga_gbm

# Best number of iterations, estimated first out-of-bag and then by
# cross-validation. The cross-validated value is the one kept in best.iter.
best.iter <- gbm.perf(liga_gbm, method = "OOB")
print(best.iter)
best.iter <- gbm.perf(liga_gbm, method = "cv")
print(best.iter)

# Relative influence of the predictors after one tree and after best.iter
# trees.
summary(liga_gbm, n.trees = 1)
summary(liga_gbm, n.trees = best.iter)

# Structure of the first and of the last fitted tree.
print(pretty.gbm.tree(liga_gbm, 1))
print(pretty.gbm.tree(liga_gbm, liga_gbm$n.trees))

# Response-scale predictions for the test set, using best.iter trees.
gbm_pr <- predict.gbm(liga_gbm, liga_test, best.iter, type = "response")


# Naive Bayes (e1071) with Laplace smoothing of 3.
start.time <- Sys.time()
set.seed(567890)
liga_nb <- naiveBayes(FTR ~ ., data = liga_train, laplace = 3)
# Stop the clock before printing, so the timing covers the model fit only.
end.time <- Sys.time()
time.taken4 <- end.time - start.time
liga_nb

# Regression model - GLM
# A multinomial model is built with the nnet package and summarised with
# Anova() from the car package.
start.time <- Sys.time()
set.seed(678)
liga_glm <- multinom(FTR ~ ., data = liga_train, trace = FALSE, maxit = 1000)
end.time <- Sys.time()
time.taken5 <- end.time - start.time

# Variables ranked by caret's overall importance score, largest first.
mostImportantVariables <- varImp(liga_glm)
mostImportantVariables$Variables <- row.names(mostImportantVariables)
mostImportantVariables <- mostImportantVariables[order(-mostImportantVariables$Overall), ]
print(head(mostImportantVariables))

# Textual view of the model, including Wald ratios.
liga_summary <- summary(liga_glm, Wald.ratios = TRUE)
liga_summary

# Each cat() ends with a newline so the log-likelihood line and the ANOVA
# header do not run together in the output.
cat(sprintf(
  "Log likelihood: %.3f (%d df)\n",
  logLik(liga_glm)[1], attr(logLik(liga_glm), "df")
))

cat("==== ANOVA ====\n")
print(Anova(liga_glm))

```


```
## Making prediction on test set. 


# The test-set response and half-time result are stored as factors so they
# can be compared with the predicted classes.
liga_test$HTR <- as.factor(liga_test$HTR)
liga_test$FTR <- as.factor(liga_test$FTR)

# --- Decision Tree ----------------------------------------------------------
# Predicted classes, confusion matrix (counts) and overall statistics.
liga_pr <- predict(liga_dt, newdata = liga_test, type = "class")
liga_pr
cm <- confusionMatrix(liga_pr, liga_test$FTR)
cm
# View(data.frame(cbind(as.matrix(liga_pr))[,1], as.matrix(liga_test$FTR))) #important.
overallcm <- cm$overall
overallcm

# --- Random Forest ----------------------------------------------------------
ligarf_pr <- predict(liga_rf, newdata = (liga_test))
# Show the Random Forest predictions (this previously re-printed the
# Decision Tree predictions, liga_pr).
ligarf_pr
rfcm <- confusionMatrix(ligarf_pr, liga_test$FTR)
rfcm
# View(data.frame(cbind(as.matrix(ligarf_pr))[,1], as.matrix(liga_test$FTR))) #important.
overallrfcm <- rfcm$overall
overallrfcm

# --- SVM --------------------------------------------------------------------
liga_kvsmpr <- predict(liga_ksvm, newdata = liga_test)
liga_kvsmpr
kvsmcm <- confusionMatrix(liga_kvsmpr, liga_test$FTR)
kvsmcm
# View(data.frame(cbind(as.matrix(liga_kvsmpr))[,1], as.matrix(liga_test$FTR))) #important.
overallkvsm <- kvsmcm$overall
overallkvsm

# --- Multinomial (linear) model ---------------------------------------------
gpr <- predict(liga_glm, newdata = liga_test)
gpr
gcm <- confusionMatrix(gpr, liga_test$FTR)
gcm
# View(data.frame(cbind(as.matrix(gpr))[,1], as.matrix(liga_test$FTR))) #important.
overallg <- gcm$overall
overallg

# --- Naive Bayes ------------------------------------------------------------
nb_pr <- predict(liga_nb, newdata = liga_test)
nb_pr
nbcm <- confusionMatrix(nb_pr, liga_test$FTR)
nbcm
# View(data.frame(cbind(as.matrix(nb_pr))[,1], as.matrix(liga_test$FTR))) #important.
overallnb <- nbcm$overall
overallnb
```


# Evaluating the models- tuning- cross validation for each model.

- Decision tree.

```{r, warning=FALSE, message=FALSE}
# Make sure the response is a factor in both sets.
liga_test[, "FTR"] <- as.factor(liga_test[, "FTR"])
liga_train[, "FTR"] <- as.factor(liga_train[, "FTR"])

library(snowfall)
# NOTE(review): caret::train() parallelises through a registered foreach
# backend; it looks like this snowfall cluster is not picked up by train().
# Confirm before relying on it for speed.
sfInit(parallel = TRUE, cpus = 5)

# 5-fold cross-validated rpart over 30 candidate complexity values.
# `repeats` is not passed: it only applies to method = "repeatedcv".
start.time <- Sys.time()
set.seed(2)
fit1 <- train(FTR ~ .,
  data = liga_train, method = "rpart", tuneLength = 30,
  trControl = trainControl(method = "cv", number = 5)
)
end.time <- Sys.time()
time.taken6 <- end.time - start.time

fit1

# Tuning profiles for both resampled metrics.
trellis.par.set(caretTheme())
plot(fit1, metric = "Kappa")
plot(fit1, metric = "Accuracy")

sfStop()

```


- GBM

```{r}
set.seed(4678)
# NOTE(review): caret::train() parallelises through a registered foreach
# backend; it looks like this snowfall cluster is not picked up by train().
sfInit(parallel = TRUE, cpus = 5)

# 5-fold cross-validated gbm over caret's default tuning grid.
# `repeats` is not passed: it only applies to method = "repeatedcv".
start.time <- Sys.time()
fit2 <- train(FTR ~ .,
  data = liga_train, method = "gbm", verbose = FALSE,
  trControl = trainControl(method = "cv", number = 5)
)
end.time <- Sys.time()
time.taken7 <- end.time - start.time

fit2

# Tuning profiles for both resampled metrics.
trellis.par.set(caretTheme())
plot(fit2, metric = "Kappa")
plot(fit2, metric = "Accuracy")

sfStop()
```


- Random Forest

```{r}
# NOTE(review): caret::train() parallelises through a registered foreach
# backend; it looks like this snowfall cluster is not picked up by train().
sfInit(parallel = TRUE, cpus = 5)
set.seed(238)
start.time <- Sys.time()

# 5-fold cross-validated random forest; the final mtry is chosen with the
# one-standard-error rule. `allowParallel` is a trainControl() setting, so it
# is passed there instead of being forwarded to randomForest(). `proximity`
# is spelled out instead of relying on partial matching of `prox`. `repeats`
# is not passed: it only applies to method = "repeatedcv".
fit3 <- train(FTR ~ .,
  data = liga_train, method = "rf", importance = TRUE,
  trControl = trainControl(
    method = "cv",
    number = 5,
    selectionFunction = "oneSE",
    allowParallel = TRUE
  ),
  proximity = TRUE
)
end.time <- Sys.time()
time.taken8 <- end.time - start.time
fit3

# Tuning profiles for both resampled metrics.
trellis.par.set(caretTheme())
plot(fit3, metric = "Kappa")
plot(fit3, metric = "Accuracy")

sfStop()

```


- Naive bayes


```{r}
# NOTE(review): caret::train() parallelises through a registered foreach
# backend; it looks like this snowfall cluster is not picked up by train().
sfInit(parallel = TRUE, cpus = 5)
set.seed(456)
start.time <- Sys.time()

# 5-fold cross-validated naive Bayes (caret method "nb").
# `repeats` is not passed: it only applies to method = "repeatedcv".
fit4 <- train(FTR ~ .,
  data = liga_train, method = "nb",
  trControl = trainControl(method = "cv", number = 5)
)
end.time <- Sys.time()
time.taken9 <- end.time - start.time
fit4

# Tuning profiles for both resampled metrics.
plot(fit4, metric = "Kappa")
plot(fit4, metric = "Accuracy")

sfStop()

```


- svm

```{r}

# NOTE(review): caret::train() parallelises through a registered foreach
# backend; it looks like this snowfall cluster is not picked up by train().
sfInit(parallel = TRUE, cpus = 5)
set.seed(765)
start.time <- Sys.time()

# 5-fold cross-validated linear SVM.
# `repeats` is not passed: it only applies to method = "repeatedcv".
fit5 <- train(FTR ~ .,
  data = liga_train, method = "svmLinear", tuneLength = 30,
  trControl = trainControl(method = "cv", number = 5)
)
end.time <- Sys.time()
time.taken10 <- end.time - start.time
fit5

# plot.train(fit5, metric="Kappa")
# plot(fit5, metric="Accuracy")

sfStop()
```


- Multinom GLM


```{r}
# NOTE(review): caret::train() parallelises through a registered foreach
# backend; it looks like this snowfall cluster is not picked up by train().
sfInit(parallel = TRUE, cpus = 5)
set.seed(239)
start.time <- Sys.time()

# 5-fold cross-validated multinomial model; hold-out predictions are saved.
# `repeats` is not passed: it only applies to method = "repeatedcv".
fit6 <- train(FTR ~ .,
  data = liga_train, method = "multinom", maxit = 1000, tuneLength = 1,
  trControl = trainControl(method = "cv", number = 5, savePredictions = TRUE)
)
end.time <- Sys.time()
time.taken11 <- end.time - start.time
fit6

sfStop()
```


- We tried to tune and see if our model was under or over fit.


## Model Selection.

- Since models are fit on the same versions of the training data, it makes sense to make inferences on the differences between models. In this way we reduce the within-resample correlation that may exist. We can compute the differences as well, for t-test.

```{r}
# Collect the resampling results of the six tuned models so that statistical
# statements can be made about their performance differences.
results <- resamples(
  list(DT = fit1, GBM = fit2, RF = fit3, NB = fit4, SVM = fit5, GLM = fit6)
)
summary(results)
bwplot(results)
dotplot(results)
splom(results)

# More direct comparison: pairwise differences between models, with a t-test
# of the null hypothesis that there is no difference between them.
difValues <- diff(results)
summary(difValues)
bwplot(difValues)
trellis.par.set(caretTheme())
dotplot(difValues)

```


- Based on the techniques implemented above for comparing the accuracy and kappa values of each model, we find that the **Linear SVM** performs better than the other models on both accuracy and kappa. We will therefore use it on the **test** dataset to get our results.


## testing on test set for selected model (SVM)
```{r}
# Class predictions of the selected linear SVM on the test set, and the
# resulting confusion matrix.
p5 <- predict(fit5, newdata = liga_test) # svm
cm5 <- confusionMatrix(p5, liga_test$FTR)
cm5

# Refit the same model with class probabilities enabled, so that per-class
# probabilities can be predicted for the test set.
# `repeats` is not passed: it only applies to method = "repeatedcv".
set.seed(75555)
fit5prob <- train(FTR ~ .,
  data = liga_train, method = "svmLinear", tuneLength = 30,
  trControl = trainControl(classProbs = TRUE, method = "cv", number = 5)
)
p5prob <- predict(fit5prob, newdata = liga_test, type = "prob") # svm prob
p5prob
```


### Our second (additional) model covers the situation where we have neither the half-time result nor the half-time goals of each team. The model above is valuable when we want to predict the winner given the half-time results. Let's try without those results and see how accurately we can then predict the winner of a match at the start. For this we remove HTHG, HTAG and HTR from our selected features (predictors) in both the train and the test dataset.


```{r}
#### Improvising features.

# Drop the half-time goals and the half-time result from both sets.
half_time_cols <- c("HTHG", "HTAG", "HTR")
liga_train[half_time_cols] <- NULL
liga_test[half_time_cols] <- NULL

#################################
# Now build the models again and cross validate.

# The former `options(warn=-1)` was removed here: it was never restored, so
# it silenced warnings for the rest of the session.

# Build the Decision Tree model (the seed is reset so results repeat).
start.time <- Sys.time()
set.seed(12)
liga_dtnew <- rpart(
  FTR ~ .,
  data = liga_train,
  method = "class",
  parms = list(split = "information"),
  control = rpart.control(usesurrogate = 0, maxsurrogate = 0)
)
end.time <- Sys.time()
time.taken1new <- end.time - start.time

# Generate a textual view of the Decision Tree model.
print(liga_dtnew)
printcp(liga_dtnew)

# Plot the resulting Decision Tree (rattle / rpart.plot).
fancyRpartPlot(liga_dtnew, main = "Decision Tree- FTR")

# List the rules from the tree using a Rattle support function.
asRules(liga_dtnew)
  

# Random Forest (randomForest package), without the half-time variables.

liga_train$FTR <- as.factor(liga_train$FTR)

# Build the Random Forest model: 500 trees, 3 candidate variables per split,
# sampling without replacement.
start.time <- Sys.time()
set.seed(1234)
liga_rfnew <- randomForest(
  formula = (FTR) ~ ., data = liga_train, ntree = 500, mtry = 3,
  importance = TRUE, na.action = na.roughfix, replace = FALSE
)
end.time <- Sys.time()
time.taken2new <- end.time - start.time

# Generate textual output of 'Random Forest' model.
liga_rfnew

# List the importance of the variables, sorted by the third importance column.
rnnew <- round(importance(liga_rfnew), 2)
rnnew[order(rnnew[, 3], decreasing = TRUE), ]

# Plot the relative importance of the variables.
varImpPlot(liga_rfnew, main = "")
title(main = "Variable Importance Random Forest")

# Plot the error rate against the number of trees. The plot draws one curve
# per column of err.rate (OOB plus one per class), so the legend takes its
# labels from those columns and uses one colour / line type per curve.
plot(liga_rfnew, main = "")
err_curves <- colnames(liga_rfnew$err.rate)
legend("topright", err_curves,
  text.col = seq_along(err_curves),
  lty = seq_along(err_curves),
  col = seq_along(err_curves)
)
title(main = "Error Rates Random Forest")

# Display tree number 1.
# printRandomForests(liga_rfnew, 1)

# Plot the OOB ROC curve.
# roc.area() / roc.plot() expect a binary 0/1 observation. The score is the
# OOB vote share of the class in the second votes column, so the observation
# is coded one-vs-rest for that same class. Both the AUC and the curve use
# the votes of liga_rfnew (the curve previously used the first model,
# liga_rf).
roc_class <- colnames(liga_rfnew$votes)[2]
roc_obs <- as.integer(liga_train$FTR == roc_class)
aucc <- roc.area(roc_obs, liga_rfnew$votes[, 2])$A
roc.plot(roc_obs, liga_rfnew$votes[, 2], main = "")
legend("bottomright",
  bty = "n",
  sprintf("Area Under the Curve (AUC) = %1.3f", aucc)
)
title(main = sprintf("OOB ROC Curve Random Forest (%s vs rest)", roc_class))


# Support Vector Machine without the half-time variables: kernlab's ksvm()
# with an RBF kernel, fitted with a probability model (prob.model = TRUE).
start.time <- Sys.time()
set.seed(567890)
liga_ksvmnew <- ksvm(
  FTR ~ .,
  data = liga_train,
  kernel = "rbfdot",
  prob.model = TRUE
)
end.time <- Sys.time()
time.taken3new <- end.time - start.time

# Textual view of the fitted SVM.
liga_ksvmnew

# Timing recorded by the author: 1.22 secs

# Generalized Boosted Regression Model (gbm) without the half-time variables:
# multinomial response, 100 trees, shrinkage 0.05, interaction depth 3,
# 3-fold cross-validation.
liga_train$FTR <- as.factor(liga_train$FTR)
liga_gbmnew <- gbm(
  FTR ~ .,
  data = liga_train,
  distribution = "multinomial",
  n.trees = 100,
  shrinkage = 0.05,
  interaction.depth = 3,
  cv.folds = 3,
  verbose = FALSE,
  n.cores = 1
)
liga_gbmnew

# Best number of iterations, estimated first out-of-bag and then by
# cross-validation. The cross-validated value is the one kept.
best.iternew <- gbm.perf(liga_gbmnew, method = "OOB")
print(best.iternew)
best.iternew <- gbm.perf(liga_gbmnew, method = "cv")
print(best.iternew)

# Relative influence after one tree and after best.iternew trees. This model's
# own best.iternew is used here and in the prediction below; the first
# model's best.iter was used before.
summary(liga_gbmnew, n.trees = 1)
summary(liga_gbmnew, n.trees = best.iternew)

# Structure of the first and of the last fitted tree.
print(pretty.gbm.tree(liga_gbmnew, 1))
print(pretty.gbm.tree(liga_gbmnew, liga_gbmnew$n.trees))

# Response-scale predictions for the test set, using best.iternew trees.
gbm_prnew <- predict.gbm(liga_gbmnew, liga_test, best.iternew,
  type = "response"
)


# Naive Bayes (e1071) with Laplace smoothing of 3, without the half-time
# variables.
start.time <- Sys.time()
set.seed(567890)
liga_nbnew <- naiveBayes(FTR ~ ., data = liga_train, laplace = 3)
# Stop the clock before printing, so the timing covers the model fit only.
end.time <- Sys.time()
time.taken4new <- end.time - start.time
liga_nbnew

# Regression model - GLM
# A multinomial model is built with the nnet package and summarised with
# Anova() from the car package.
start.time <- Sys.time()
set.seed(678)
liga_glmnew <- multinom(FTR ~ ., data = liga_train, trace = FALSE, maxit = 500)
end.time <- Sys.time()
time.taken5new <- end.time - start.time

# Variables ranked by caret's overall importance score, largest first.
mostImportantVariablesnew <- varImp(liga_glmnew)
mostImportantVariablesnew$Variables <- row.names(mostImportantVariablesnew)
mostImportantVariablesnew <- mostImportantVariablesnew[order(-mostImportantVariablesnew$Overall), ]
print(head(mostImportantVariablesnew))

# Textual view of the model, including Wald ratios.
liga_summarynew <- summary(liga_glmnew, Wald.ratios = TRUE)
liga_summarynew

# Each cat() ends with a newline so the log-likelihood line and the ANOVA
# header do not run together in the output.
cat(sprintf(
  "Log likelihood: %.3f (%d df)\n",
  logLik(liga_glmnew)[1], attr(logLik(liga_glmnew), "df")
))

cat("==== ANOVA ====\n")
print(Anova(liga_glmnew))


```


```
##making predictions on test set.

# --- Decision Tree ----------------------------------------------------------
# Predicted classes, confusion matrix (counts) and overall statistics.
liga_prnew <- predict(liga_dtnew, newdata = liga_test, type = "class")
liga_prnew
cmnew <- confusionMatrix(liga_prnew, liga_test$FTR)
cmnew
# View(data.frame(cbind(as.matrix(liga_pr))[,1], as.matrix(liga_test$FTR))) #important.
overallcmnew <- cmnew$overall
overallcmnew

# --- Random Forest ----------------------------------------------------------
ligarf_prnew <- predict(liga_rfnew, newdata = (liga_test))
ligarf_prnew
rfcmnew <- confusionMatrix(ligarf_prnew, liga_test$FTR)
rfcmnew
# View(data.frame(cbind(as.matrix(ligarf_pr))[,1], as.matrix(liga_test$FTR))) #important.
overallrfcmnew <- rfcmnew$overall
overallrfcmnew

# --- SVM --------------------------------------------------------------------
liga_kvsmprnew <- predict(liga_ksvmnew, newdata = liga_test)
liga_kvsmprnew
kvsmcmnew <- confusionMatrix(liga_kvsmprnew, liga_test$FTR)
kvsmcmnew
# View(data.frame(cbind(as.matrix(liga_kvsmpr))[,1], as.matrix(liga_test$FTR))) #important.
overallkvsmnew <- kvsmcmnew$overall
overallkvsmnew

# --- Multinomial (linear) model ---------------------------------------------
gprnew <- predict(liga_glmnew, newdata = liga_test)
gprnew
gcmnew <- confusionMatrix(gprnew, liga_test$FTR)
gcmnew
# View(data.frame(cbind(as.matrix(gpr))[,1], as.matrix(liga_test$FTR))) #important.
overallgnew <- gcmnew$overall
overallgnew

# --- Naive Bayes ------------------------------------------------------------
nb_prnew <- predict(liga_nbnew, newdata = liga_test)
nb_prnew
nbcmnew <- confusionMatrix(nb_prnew, liga_test$FTR)
nbcmnew
# View(data.frame(cbind(as.matrix(nb_pr))[,1], as.matrix(liga_test$FTR))) #important.
overallnbnew <- nbcmnew$overall
overallnbnew
```

# Evaluating the models- tuning- cross validation for each model and testing it.


- Decision Tree.


```{r}

# NOTE(review): caret::train() parallelises through a registered foreach
# backend; it looks like this snowfall cluster is not picked up by train().
sfInit(parallel = TRUE, cpus = 5)

# 5-fold cross-validated rpart over 30 candidate complexity values.
# Only the seed that takes effect is kept; an earlier set.seed(567) was
# overridden by set.seed(2) before any random draw. `repeats` is not passed:
# it only applies to method = "repeatedcv".
start.time <- Sys.time()
set.seed(2)
fit1new <- train(FTR ~ .,
  data = liga_train, method = "rpart", tuneLength = 30,
  trControl = trainControl(method = "cv", number = 5)
)
end.time <- Sys.time()
time.taken6new <- end.time - start.time

fit1new

# Tuning profiles for both resampled metrics.
plot(fit1new, metric = "Kappa")
plot(fit1new, metric = "Accuracy")

sfStop()
```


- GBM

```{r}
# NOTE(review): caret::train() parallelises through a registered foreach
# backend; it looks like this snowfall cluster is not picked up by train().
sfInit(parallel = TRUE, cpus = 5)
set.seed(879)
start.time <- Sys.time()

# Plain 5-fold cross-validated gbm over caret's default tuning grid (not
# repeated). `repeats` is not passed: it only applies to
# method = "repeatedcv".
fit2new <- train(FTR ~ .,
  data = liga_train, method = "gbm", verbose = FALSE,
  trControl = trainControl(method = "cv", number = 5)
)
end.time <- Sys.time()
time.taken7new <- end.time - start.time

fit2new

# Tuning profiles for both resampled metrics.
plot(fit2new, metric = "Kappa")
plot(fit2new, metric = "Accuracy")

sfStop()

```


- Random Forest

```{r}
# NOTE(review): caret::train() parallelises through a registered foreach
# backend; it looks like this snowfall cluster is not picked up by train().
sfInit(parallel = TRUE, cpus = 5)
set.seed(231)
start.time <- Sys.time()

# 5-fold cross-validated random forest (500 trees); the final mtry is chosen
# with the one-standard-error rule. `allowParallel` is a trainControl()
# setting, so it is passed there instead of being forwarded to
# randomForest(). `proximity` is spelled out instead of relying on partial
# matching of `prox`. `repeats` is not passed: it only applies to
# method = "repeatedcv".
fit3new <- train(FTR ~ .,
  data = liga_train, method = "rf", ntree = 500, importance = TRUE,
  trControl = trainControl(
    method = "cv",
    number = 5,
    selectionFunction = "oneSE",
    allowParallel = TRUE
  ),
  proximity = TRUE
)
end.time <- Sys.time()
time.taken8new <- end.time - start.time
fit3new

# Tuning profiles for both resampled metrics.
plot(fit3new, metric = "Kappa")
plot(fit3new, metric = "Accuracy")

sfStop()
```


- Naive Bayes

```{r}

# NOTE(review): caret::train() parallelises through a registered foreach
# backend; it looks like this snowfall cluster is not picked up by train().
sfInit(parallel = TRUE, cpus = 5)
set.seed(345)
start.time <- Sys.time()

# 5-fold cross-validated naive Bayes (caret method "nb").
# `repeats` is not passed: it only applies to method = "repeatedcv".
fit4new <- train(FTR ~ .,
  data = liga_train, method = "nb",
  trControl = trainControl(method = "cv", number = 5)
)
end.time <- Sys.time()
time.taken9new <- end.time - start.time
# Print the model fitted in this chunk (this previously printed fit4, the
# naive Bayes fit of the first model).
fit4new

# Tuning profiles for both resampled metrics.
plot(fit4new, metric = "Kappa")
plot(fit4new, metric = "Accuracy")

sfStop()
```


- SVM 

```{r}
# NOTE(review): caret::train() parallelises through a registered foreach
# backend; it looks like this snowfall cluster is not picked up by train().
sfInit(parallel = TRUE, cpus = 5)
set.seed(12)
start.time <- Sys.time()
# 5-fold cross-validated linear SVM.
# `repeats` is not passed: it only applies to method = "repeatedcv".
fit5new <- train(FTR ~ .,
  data = liga_train, method = "svmLinear", tuneLength = 30,
  trControl = trainControl(method = "cv", number = 5)
)
end.time <- Sys.time()
time.taken10new <- end.time - start.time
fit5new

sfStop()
```


- GLM- Multinom.

```{r}

# NOTE(review): caret::train() parallelises through a registered foreach
# backend; it looks like this snowfall cluster is not picked up by train().
sfInit(parallel = TRUE, cpus = 5)
set.seed(123)
start.time <- Sys.time()

# 5-fold cross-validated multinomial model; hold-out predictions are saved.
# `repeats` is not passed: it only applies to method = "repeatedcv".
fit6new <- train(FTR ~ .,
  data = liga_train, method = "multinom", maxit = 1000, tuneLength = 1,
  trControl = trainControl(method = "cv", number = 5, savePredictions = TRUE)
)
end.time <- Sys.time()
time.taken11new <- end.time - start.time
fit6new

sfStop()

```


## Model Selection.

- Since models are fit on the same versions of the training data, it makes sense to make inferences on the differences between models. In this way we reduce the within-resample correlation that may exist. We can compute the differences as well, for t-test.


```{r}
# Collect the resampling results of the six tuned second-stage models so that
# statistical statements can be made about their performance differences.
results_newmodel <- resamples(
  list(
    DT2nd = fit1new, GBM2nd = fit2new, RF2nd = fit3new,
    NB2nd = fit4new, SVM2nd = fit5new, GLM2nd = fit6new
  )
)
summary(results_newmodel)
bwplot(results_newmodel)
dotplot(results_newmodel)
splom(results_newmodel)

# More direct comparison: pairwise differences between models, with a t-test
# of the null hypothesis that there is no difference between them.
difValues2nd <- diff(results_newmodel)
summary(difValues2nd)
bwplot(difValues2nd)
trellis.par.set(caretTheme())
dotplot(difValues2nd)
```


- Based on the techniques implemented above for comparing the accuracy and kappa values of each model, we find that the **Linear SVM** performs better than the other models on both accuracy and kappa. We will therefore use it on the **test** dataset to get our results.
- Interestingly, in both modelling setups the SVM has outperformed the other models.


## Testing on test data with our selected model.


```{r}

# Class predictions of the selected linear SVM (no half-time variables) on
# the test set, with the confusion matrix and its overall statistics.
p5new <- predict(fit5new, newdata = liga_test)
cm5new <- confusionMatrix(p5new, liga_test$FTR)
cm5new
cm5new$overall # svm

# Refit the same model with class probabilities enabled, so that per-class
# probabilities can be predicted for the test set.
# `repeats` is not passed: it only applies to method = "repeatedcv".
set.seed(67885)
fit5newprob <- train(FTR ~ .,
  data = liga_train, method = "svmLinear", tuneLength = 30,
  trControl = trainControl(classProbs = TRUE, method = "cv", number = 5)
)
p5newprob <- predict(fit5newprob, newdata = liga_test, type = "prob") # svm prob
p5newprob


```


## Final results.

```{r}
# Betting experts' odds for the test-season matches, for comparison with our
# predictions. Dates are re-expressed as ISO strings and filtered with the
# same cutoff as the test set, then the decimal odds are converted to
# probabilities.
d <- as.Date(laligadf$Date, "%d/%m/%y")
d <- strftime(d, "%Y-%m-%d")
laligadf$Date <- d
lig <- subset(laligadf, Date > "2015-08-01")
lig <- lig[, c("B365H", "B365A", "B365D")]
lig$B365H <- odds.dec2prob(lig$B365H)
lig$B365A <- odds.dec2prob(lig$B365A)
lig$B365D <- odds.dec2prob(lig$B365D)

# Final set: the actual result, the two predicted classes (p5: HTR known,
# p5new: HTR unknown), the predicted class probabilities for both models
# (columns are named <prefix>.<class>), and the bookmaker probabilities.
# The predictor columns of the test set are no longer needed and are left
# out.
liga_test <- data.frame(
  FTR = liga_test$FTR,
  p5 = p5,
  p5new = p5new,
  PredictedFTRProbWhenHTRisknown = p5prob,
  PredictedFTRProbWhenHTRisUnknown = p5newprob,
  B365H = lig$B365H,
  B365A = lig$B365A,
  B365D = lig$B365D
)
head(liga_test)

# Comparison between our predicted class and the actual class (an indirect
# visualization of the confusion matrix, in a way).

# First when HTR is known.
spineplot(liga_test$FTR, liga_test$p5)

# Now when HTR is unknown.
spineplot(liga_test$FTR, liga_test$p5new)

# Comparing probabilities with each other: betting experts' probabilities vs
# our predicted probabilities.

# --- When HTR is unknown ----------------------------------------------------

# "Home" grid (matches the home team actually won). Where the betting experts
# gave home-win probabilities, we predicted some of the results as a loss for
# the home team, in favor of the away team (as the color indicates), and very
# few as a draw. Despite that, comparing the probability numbers shows our
# predicted probabilities are higher and hence more reliable.
ggplot(liga_test, aes(x = B365H, y = PredictedFTRProbWhenHTRisUnknown.H, colour = p5new)) +
  geom_line() +
  facet_grid(FTR ~ ., scales = "free")

# "Away" grid (matches the away team actually won). Where the betting experts
# gave away-win probabilities, we predicted some of the results as a loss for
# the away team, in favor of the home team (as the color indicates), and none
# as a draw. Despite that, comparing the probability numbers shows our
# predicted probabilities are higher and hence more reliable.
ggplot(liga_test, aes(x = B365A, y = PredictedFTRProbWhenHTRisUnknown.A, colour = p5new)) +
  geom_line() +
  facet_grid(FTR ~ ., scales = "free")

# --- When HTR is known ------------------------------------------------------

# "Home" grid: where the betting experts predicted a home win, we predicted
# some of the results as a draw or loss as well, but despite that our
# predicted probabilities are higher and hence more reliable.
ggplot(liga_test, aes(x = B365H, y = PredictedFTRProbWhenHTRisknown.H, colour = p5)) +
  geom_line() +
  facet_grid(FTR ~ ., scales = "free")

# "Away" grid: where the betting experts predicted an away win, we predicted
# some of the results as a draw or loss as well, but despite that our
# predicted probabilities are higher and hence more reliable.
ggplot(liga_test, aes(x = B365A, y = PredictedFTRProbWhenHTRisknown.A, colour = p5)) +
  geom_line() +
  facet_grid(FTR ~ ., scales = "free")

# "Draw" grid: where the betting experts predicted a draw, our predicted
# probabilities are higher and hence more reliable.
ggplot(liga_test, aes(x = B365D, y = PredictedFTRProbWhenHTRisknown.D, colour = p5)) +
  geom_line() +
  facet_grid(FTR ~ ., scales = "free")

```