Classification Metrics

We use RMSE (Root Mean Squared Error) to evaluate the performance of a prediction model with a continuous outcome. RMSE, however, is not commonly used for categorical outcomes. Instead, we use several metrics derived from the confusion matrix, which was introduced in the Naive Bayes handouts. Therefore, the first step in assessing the predictive power of a statistical model with a categorical outcome is forming a confusion matrix.
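
In R, a confusion matrix is just a cross-tabulation of observed against predicted classes. A minimal sketch, using hypothetical vectors y_obs and y_hat:

y_obs = c(0, 0, 1, 1, 0, 1)  # observed classes
y_hat = c(0, 1, 1, 0, 0, 1)  # predicted classes
table(y_obs, y_hat)          # rows: observed, columns: predicted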

Confusion Matrix and Misclassification

“Ethnicity, Insurgency, and Civil War” by Fearon and Laitin (2003), published in the American Political Science Review, is one of the most cited studies in the conflict literature. We will use this dataset for this session.

rawData=read.csv("https://raw.githubusercontent.com/babakrezaee/MethodsCourses/master/DataSets/FearonLaitin2003.csv")

head(rawData, n=2)
##   ccode country cname cmark year wars war warl onset ethonset durest aim
## 1     2     USA   USA     1 1945    0   0    0     0        0     NA  NA
## 2     2     USA   USA     0 1946    0   0    0     0        0     NA  NA
##   casename ended ethwar waryrs    pop     lpop polity2 gdpen gdptype
## 1             NA     NA        140969 11.85630      10 7.626       3
## 2             NA     NA        141936 11.86313      10 7.654       3
##   gdpenl lgdpenl1  lpopl1                        region western eeurop
## 1  7.626 8.939319 11.8563 western democracies and japan       1      0
## 2  7.626 8.939319 11.8563 western democracies and japan       1      0
##   lamerica ssafrica asia nafrme colbrit colfra mtnest  lmtnest elevdiff
## 1        0        0    0      0       1      0   23.9 3.214868     6280
## 2        0        0    0      0       1      0   23.9 3.214868     6280
##   Oil ncontig   ethfrac       ef plural second numlang relfrac plurrel
## 1   0       1 0.3569501 0.490957  0.691  0.125       3   0.596      56
## 2   0       1 0.3569501 0.490957  0.691  0.125       3   0.596      56
##   minrelpc muslim nwstate polity2l instab anocl deml empethfrac empwarl
## 1       28    1.9       0       10      0     0    1  0.4936721       0
## 2       28    1.9       0       10      0     0    1  0.3569501       0
##   emponset empgdpenl emplpopl emplmtnest empncontig empolity2l sdwars
## 1        0   6.79718 11.97919   3.211572          1    7.68712      0
## 2        0   6.79718 11.97919   3.214868          1   10.00000      0
##   sdonset colwars colonset cowwars cowonset cowwarl sdwarl colwarl
## 1       0      NA       NA       0        0       0      0      NA
## 2       0      NA       NA       0        0       0      0      NA
# Keep the outcome (onset) and the covariates used below; drop rows with missing values
Data = na.omit(cbind(rawData$onset, rawData$warl, rawData$gdpenl, rawData$lpopl1,
                     rawData$lmtnest, rawData$ncontig, rawData$Oil, rawData$nwstate,
                     rawData$instab, rawData$polity2l, rawData$ethfrac, rawData$relfrac))

# Min-max scaling: rescale a variable to the [0, 1] interval
scf = function(x) {return((x-min(x))/(max(x)-min(x)))}

x = Data[,-1]        # covariates

x = apply(x, 2, scf) # scale each column to [0, 1]

y = Data[,1]         # outcome: civil war onset

Data = cbind(y, x)

Data=data.frame(Data)

colnames(Data) <- c("onset","warL","gdpenL","populationL","mountainous",
                    "contiguous", "oil", "newState", "instability","polityL",
                    "ELF", "RelELF")

is.factor(Data$onset)
## [1] FALSE
table(Data$onset)
## 
##    0    1    4 
## 6221  105    1
# One observation has onset = 4, which seems to be a data-entry error; we recode values above 1 to 1:

Data$onset[Data$onset>1] <- 1

table(Data$onset)
## 
##    0    1 
## 6221  106
Data$onset=factor(Data$onset)



# The logit model below is estimated with glm(), which ships with base R

fm_binomial = glm(onset ~ warL + gdpenL + populationL + mountainous + contiguous + oil +
                    newState + instability + polityL + ELF + RelELF,
                  data = Data, family = "binomial")

fm_binomial$fitted.values[1:10]
##           1           2           3           4           5           6 
## 0.009458287 0.009458287 0.009385206 0.008282231 0.007654327 0.008318317 
##           7           8           9          10 
## 0.006527791 0.005844748 0.005942122 0.005525181
library(dplyr)
Yhat_logit = if_else(fm_binomial$fitted.values > .5, 1, 0)  # classify as onset when the fitted probability exceeds .5

table(Yhat_logit)
## Yhat_logit
##    0 
## 6327

In the example above, we first estimated the probability of civil war onset and then used a threshold, i.e. \(.5\), to categorize the predicted outcomes into 0, i.e. no civil war, and 1, i.e. civil war. Indeed, in classification problems, we often estimate the probability of each category and then pick a threshold to map these probabilities into predicted categorical outcomes. With the \(.5\) threshold, no observation is classified as an onset, so we lower the threshold to \(.1\):

Yhat_logit = if_else(fm_binomial$fitted.values > .1, 1, 0)  # a lower threshold: .1

table(Yhat_logit)
## Yhat_logit
##    0    1 
## 6234   93
Y = Data$onset  # observed outcomes

ctab = table(Y, Yhat_logit)  # confusion matrix: rows observed, columns predicted

ctab
##    Yhat_logit
## Y      0    1
##   0 6144   77
##   1   90   16
missClass = (sum(ctab)-sum(diag(ctab)))/sum(ctab)  # share of off-diagonal (misclassified) cases

perCW = ctab[2,2]/sum(ctab[,2])  # share of predicted onsets that are true onsets

cat("Misclassification and civil war classification rates using the logit model are", round(100*missClass,2),"% and", round(100*perCW,2),"%, respectively\n")
## Misclassification and civil war classification rates using the logit model are 2.64 % and 17.2 %, respectively
logitProbFit = data.frame(CivilWar=Data$onset,1-fm_binomial$fitted.values,fm_binomial$fitted.values)

names(logitProbFit)[2:3] = c("No", "Yes")

par(mfrow=c(1,2))
par(mai=c(1,1,.5,.5))
# Boxplots of the fitted probabilities of peace (No) and civil war (Yes), by observed outcome
plot(No~CivilWar, logitProbFit, col=c(grey(.5),2:3), cex.lab=2.4, cex.axis=2.4)
plot(Yes~CivilWar, logitProbFit, col=c(grey(.5),2:3), cex.lab=2.4, cex.axis=2.4)

library(kknn)
# k-nearest neighbors with k = 10; note that Data serves as both training and test set here
fm_knn = kknn(onset ~ warL + gdpenL + populationL + mountainous + contiguous + oil +
                newState + instability + polityL + ELF + RelELF,
              Data, Data, k = 10, kernel = "rectangular")

table(fm_knn$fitted.values)
## 
##    0    1 
## 6325    2
fm_knn$fitted.values[545:615]
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [36] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [71] 0
## Levels: 0 1
rawData$onset[545:615]  # caution: these indices refer to rawData; after na.omit() they need not align with Data's rows
##  [1] 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [36] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
## [71] 0
ctab = table(Data$onset, fm_knn$fitted.values)  # kNN's default (majority-vote) classification

ctab
##    
##        0    1
##   0 6220    1
##   1  105    1
Yhat_kNN <- if_else(fm_knn$prob[,2] > .1, 1, 0)  # classify as onset when the kNN vote share for class 1 exceeds .1

ctab=table(Y,Yhat_kNN)
 
ctab
##    Yhat_kNN
## Y      0    1
##   0 6146   75
##   1   84   22
missClass = (sum(ctab)-sum(diag(ctab)))/sum(ctab)

perCW = ctab[2,2]/sum(ctab[,2])

cat("Misclassification and civil war classification rates using the kNN model are", round(100*missClass,2),"% and", round(100*perCW,2),"%, respectively\n")
## Misclassification and civil war classification rates using the kNN model are 2.51 % and 22.68 %, respectively
kNNProbFit = data.frame(CivilWar=Data$onset,fm_knn$prob)

names(kNNProbFit)[2:3] = c("No", "Yes")

par(mfrow=c(1,2))
par(mai=c(1,1,.5,.5))
plot(No~CivilWar,kNNProbFit,col=c(grey(.5),2:3),cex.lab=2.4,cex.axis=2.4)
plot(Yes~CivilWar,kNNProbFit,col=c(grey(.5),2:3),cex.lab=2.4,cex.axis=2.4)

These plots present an important issue with quantitative models of civil war onset: similar to Fearon and Laitin (2003), these models are good at predicting peace, but not civil war/conflict onset!

Lift, ROC, and AUC

The graph below shows the terms commonly used to label the elements of a confusion matrix. Most classification measures are defined using these terms/labels.

Confusion matrix
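
As a plain-text stand-in for that graph, a small sketch that prints the standard labels, with rows as observed classes and columns as predicted classes:

cm_labels = matrix(c("TN","FN","FP","TP"), nrow=2,
                   dimnames=list(Observed=c("0","1"), Predicted=c("0","1")))
cm_labels  # e.g., the cell (Observed 0, Predicted 1) is a False Positive (FP)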

Lift Curve

The lift curve is one of the commonly used measures of classification performance. It shows how successful the model's predicted probabilities, \(\hat{p}=P(Y=1|x)\), are at capturing the correct outcomes, \(Y\).

To plot this curve, we first sort the fitted data by \(\hat{p}\) in decreasing order. In other words, we expect the observations with the largest \(\hat{p}\) to be the most likely True Positives. Then, we plot the percentage of observations taken against the cumulative percentage of successes (true positives) captured.
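
A minimal sketch of this construction, assuming `y` is a 0/1 numeric outcome and `phat` the fitted probabilities (the course-provided liftf(), sourced below, is the fuller implementation used in this handout):

lift_simple = function(y, phat) {
  ord = order(phat, decreasing=TRUE)  # take observations with the largest phat first
  cumsum(y[ord])/sum(y)               # cumulative share of successes captured
}

Plotted against the share of observations taken, (1:length(y))/length(y), this traces out the lift curve drawn below.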

library(devtools)
# liftf() is a course-provided helper for computing lift curves
source_url('https://raw.githubusercontent.com/babakrezaee/DataWrangling/master/Functions/lfitPlot_fun.R')



olift = liftf(Data$onset, fm_binomial$fitted.values, dopl=FALSE)  # lift for the logit model
olift2 = liftf(Data$onset, fm_knn$prob[,2], dopl=FALSE)           # lift for the kNN model

ii = (1:length(olift))/length(olift)  # share of observations tried

plot(ii, olift, type='n', lwd=2, xlab='% tried', ylab='% of successes', cex.lab=2)
lines(ii, olift, col='red')
lines(ii, olift2, col='blue')
abline(0, 1, lty=2)  # 45-degree line: expected lift of a random classifier

legend('bottomright',legend=c('logit','kNN'),col=c('red','blue'),lwd=3)

Sensitivity, Specificity, and Precision

\[\begin{align} Recall/Sensitivity/True~Positive~Rate=\frac{TP}{TP+FN} \end{align}\]

\[\begin{align} Specificity/Selectivity/True~Negative~Rate=\frac{TN}{TN+FP} \end{align}\]
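
Precision, which enters the F-score below, is the share of predicted positives that are truly positive:

\[\begin{align} Precision/Positive~Predictive~Value=\frac{TP}{TP+FP} \end{align}\]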
library(caret)
confusionMatrix(data = as.factor(Yhat_kNN), reference = as.factor(Y))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6146   84
##          1   75   22
##                                           
##                Accuracy : 0.9749          
##                  95% CI : (0.9707, 0.9786)
##     No Information Rate : 0.9832          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.204           
##  Mcnemar's Test P-Value : 0.5258          
##                                           
##             Sensitivity : 0.9879          
##             Specificity : 0.2075          
##          Pos Pred Value : 0.9865          
##          Neg Pred Value : 0.2268          
##              Prevalence : 0.9832          
##          Detection Rate : 0.9714          
##    Detection Prevalence : 0.9847          
##       Balanced Accuracy : 0.5977          
##                                           
##        'Positive' Class : 0               
## 
Note that caret takes the first factor level, 0 (no onset), as the positive class; the high Sensitivity above therefore measures how well peace is predicted, while the low Specificity measures the much weaker prediction of civil war onset.

confusionMatrix(data = as.factor(Yhat_logit), reference = as.factor(Y))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6144   90
##          1   77   16
##                                           
##                Accuracy : 0.9736          
##                  95% CI : (0.9694, 0.9774)
##     No Information Rate : 0.9832          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.1475          
##  Mcnemar's Test P-Value : 0.3531          
##                                           
##             Sensitivity : 0.9876          
##             Specificity : 0.1509          
##          Pos Pred Value : 0.9856          
##          Neg Pred Value : 0.1720          
##              Prevalence : 0.9832          
##          Detection Rate : 0.9711          
##    Detection Prevalence : 0.9853          
##       Balanced Accuracy : 0.5693          
##                                           
##        'Positive' Class : 0               
## 
\[\begin{align} F=2\frac{Recall \times Precision}{Recall+Precision} \end{align}\]
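For intuition, the same quantities can be computed by hand from the kNN confusion matrix ctab above, here treating civil war (the second row/column) as the positive class, unlike caret's default of taking the first factor level, 0:

TP = ctab[2,2]; FP = ctab[1,2]; FN = ctab[2,1]  # counts for the civil war class
precision = TP/(TP+FP)                          # 22/97  ~ 0.23
recall = TP/(TP+FN)                             # 22/106 ~ 0.21
F1 = 2*precision*recall/(precision + recall)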
confusionMatrix(data = as.factor(Yhat_kNN), reference = as.factor(Y), mode = "prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6146   84
##          1   75   22
##                                           
##                Accuracy : 0.9749          
##                  95% CI : (0.9707, 0.9786)
##     No Information Rate : 0.9832          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.204           
##  Mcnemar's Test P-Value : 0.5258          
##                                           
##               Precision : 0.9865          
##                  Recall : 0.9879          
##                      F1 : 0.9872          
##              Prevalence : 0.9832          
##          Detection Rate : 0.9714          
##    Detection Prevalence : 0.9847          
##       Balanced Accuracy : 0.5977          
##                                           
##        'Positive' Class : 0               
## 
confusionMatrix(data = as.factor(Yhat_logit), reference = as.factor(Y), mode = "prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6144   90
##          1   77   16
##                                           
##                Accuracy : 0.9736          
##                  95% CI : (0.9694, 0.9774)
##     No Information Rate : 0.9832          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.1475          
##  Mcnemar's Test P-Value : 0.3531          
##                                           
##               Precision : 0.9856          
##                  Recall : 0.9876          
##                      F1 : 0.9866          
##              Prevalence : 0.9832          
##          Detection Rate : 0.9711          
##    Detection Prevalence : 0.9853          
##       Balanced Accuracy : 0.5693          
##                                           
##        'Positive' Class : 0               
## 
The ROC (Receiver Operating Characteristic) curve plots the true positive rate (sensitivity) against the false positive rate (1 - specificity) as the classification threshold varies. The AUC (Area Under the Curve) summarizes the curve in a single number: .5 corresponds to random guessing and 1 to perfect classification.

library(pROC)

par(mfrow=c(1,2))
par(mai=c(1,1,.5,.5))



rocR = roc(response=Y,predictor=fm_binomial$fitted.values)
AUC = auc(rocR)
plot(rocR, col='maroon')
title(main=paste("logit model AUC= ",round(AUC,2)))

rocR = roc(response=Y,predictor=fm_knn$prob[,2])
AUC = auc(rocR)
plot(rocR, col='blue')
title(main=paste("kNN model AUC= ",round(AUC,2)))

How to decide on the threshold?

The threshold trades sensitivity off against specificity: lowering it catches more civil war onsets (fewer false negatives) at the cost of more false alarms (more false positives). The course-provided helpers sourced below are used to visualize this trade-off along the ROC curve.
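
Before turning to those helpers, a minimal, self-contained sketch of this search using the logit fit above: scan a grid of thresholds and, as one common rule, pick the one maximizing Youden's J statistic (sensitivity + specificity - 1).

y01 = as.numeric(as.character(Data$onset))               # observed outcome as 0/1
phat = fm_binomial$fitted.values                         # fitted probabilities from the logit model
ths = seq(0.01, 0.5, by=0.01)                            # candidate thresholds
sens = sapply(ths, function(t) mean(phat[y01==1] > t))   # sensitivity at each threshold
spec = sapply(ths, function(t) mean(phat[y01==0] <= t))  # specificity at each threshold
ths[which.max(sens + spec - 1)]                          # threshold maximizing Youden's J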

#source('ROC_calculate.R')
source_url('https://raw.githubusercontent.com/babakrezaee/MachineLearningAlgorithms/master/ClassificationMetrics/ROC_calculate.R')


#source('ROC_plot.R') 
source_url('https://raw.githubusercontent.com/babakrezaee/MachineLearningAlgorithms/master/ClassificationMetrics/ROC_plot.R')

results_table = data.frame(phat=fm_binomial$fitted.values, yobs=Y)  # logit model
threshold = .2
roc <- ROC_calculate(results_table, 1, 1, n = 100)
AUC = round(auc(results_table$yobs, results_table$phat), 3)

ROC_plot(results_table, roc, .1, 1, 1)

results_table = data.frame(phat=fm_knn$prob[,2], yobs=Y)  # kNN model
threshold = .2
roc <- ROC_calculate(results_table, 1, 1, n = 100)
AUC = round(auc(results_table$yobs, results_table$phat), 3)

ROC_plot(results_table, roc, .1, 1, 1)


  1. Copyright 2019. This is an in-progress project, please do not cite or reproduce without my permission.