Precision-Recall and Receiver Operator Characteristics Curves

library(SLmetrics)

In this vignette a worked example on creating Precision-Recall and Receiver Operator Characteristics curves are provided. Throughout this vignette the Banknote Authentication-dataset is used. The banknote-dataset is a part of {SLmetrics} which is list of features and targets, and can be called as follows:

# 1) load data into namespace
data(
    banknote,
    package = "SLmetrics"
)

The banknote dataset classification tasks achieves between 95% and 99% accuracy and therefore makes a bad case for demonstrating Precision-Recall and Receiver Operator Characteristics curves. To alleviate this, random noise will be injected to the original dataset as follows:

# 1) set seed
set.seed(1903)

# 2) extract indices
# for shuffling
noise <- sample(
    x = 1:nrow(banknote$features),
    size = nrow(banknote$features) * 0.50
)

# 3) reshuffle
# features and target
noise <- cbind(
    banknote$features[sample(noise),],
    target = banknote$target[sample(noise)]
)

The data.frame is constructed as follows:

# 1) convert to data.frame
# and head
head(
    banknote <- cbind(
        banknote$features,
        target = banknote$target
        )
)
#>   variance skewness curtosis  entropy    target
#> 1  3.62160   8.6661  -2.8073 -0.44699 authentic
#> 2  4.54590   8.1674  -2.4586 -1.46210 authentic
#> 3  3.86600  -2.6383   1.9242  0.10645 authentic
#> 4  3.45660   9.5228  -4.0112 -3.59440 authentic
#> 5  0.32924  -4.4552   4.5718 -0.98880 authentic
#> 6  4.36840   9.6718  -3.9606 -3.16250 authentic

# 2) introduce random
# noise to the data
# NOTE: wrapped in `try()` in case 
# noise is removed
try(
    expr = {
        banknote <- rbind(
        banknote,
        noise
    )
    },
    silent = TRUE
)

# 3) convert target to binary
# value
banknote$target <- as.numeric(
    banknote$target == "inauthentic"
)

Authentic or inauthentic banknote

To predict whether the banknotes are authentic or inauthentic a logistic regression will be trained on a training sample, and evaluated on a the test sample.

Training/Test split

To train and test test the model a training/test split with 80% and 20%.

# 1) set seed
set.seed(1903)

# 2) generate indices
index <- sample(
    x = 1:nrow(banknote),
    size = nrow(banknote) * 0.80
)

# 3) split data
# 3.1) training
train <- banknote[index,]
test  <- banknote[-index,]

Training the logistic regression

# 1) train the logistic
# regression
model <- glm(
    formula = target ~ .,
    data    = train,
    family  = binomial(
        link = "logit"
    ) 
)

Evaluate Performance

To evaluate the performance we will extract the response probabilities

# 1) extract class
# probabilites
class_probabilities <- predict(
    object  = model,
    newdata = subset(test, select = -target),
    type    = "response"
)

# 2) calculate class
class_probabilities <- as.matrix(
    cbind(
        class_probabilities,
        1 - class_probabilities
    )
)

Visualize Precision-Recall Curve

# 1) create actual
# value
actual <- factor(
    x = test$target,
    levels = c(1, 0),
    labels = c("inauthentic", "authentic")
)
# 1) construct precision-recall 
# object
print(
    precision_recall <- prROC(
        actual   = actual,
        response = class_probabilities
    )
)
#>    threshold level       label  recall precision
#> 1        Inf     1 inauthentic 0.00000     1.000
#> 2      0.919     1 inauthentic 0.00535     1.000
#> 3      0.917     1 inauthentic 0.01070     1.000
#> 4      0.909     1 inauthentic 0.01604     1.000
#> 5      0.906     1 inauthentic 0.02139     1.000
#> 6      0.903     1 inauthentic 0.02674     1.000
#> 7      0.901     1 inauthentic 0.03209     1.000
#> 8      0.898     1 inauthentic 0.03209     0.857
#> 9      0.898     1 inauthentic 0.03743     0.875
#> 10     0.895     1 inauthentic 0.04278     0.889
#>  [ reached 'max' / getOption("max.print") -- omitted 816 rows ]

The Precision-Recall object can be visualized by using plot()

plot(
    precision_recall
)

pr.auc(
    actual   = actual,
    response = class_probabilities
)
#> inauthentic   authentic 
#>   0.7961331   0.8265086

Visualize Receiver Operator Characteristics Curve

# 1) construct Receiver Operator Characteristics 
# object
print(
    receiver_operator_characteristics <- ROC(
        actual   = actual,
        response = class_probabilities
    )
)
#>    threshold level       label     tpr     fpr
#> 1        Inf     1 inauthentic 0.00000 0.00000
#> 2      0.919     1 inauthentic 0.00535 0.00000
#> 3      0.917     1 inauthentic 0.01070 0.00000
#> 4      0.909     1 inauthentic 0.01604 0.00000
#> 5      0.906     1 inauthentic 0.02139 0.00000
#> 6      0.903     1 inauthentic 0.02674 0.00000
#> 7      0.901     1 inauthentic 0.03209 0.00000
#> 8      0.898     1 inauthentic 0.03209 0.00444
#> 9      0.898     1 inauthentic 0.03743 0.00444
#> 10     0.895     1 inauthentic 0.04278 0.00444
#>  [ reached 'max' / getOption("max.print") -- omitted 816 rows ]

The Receiver Operator Characteristics object can be visualized by using plot()

plot(
    receiver_operator_characteristics
)

roc.auc(
    actual   = actual,
    response = class_probabilities
)
#> inauthentic   authentic 
#>   0.8464171   0.8464409

mirror server hosted at Truenetwork, Russian Federation.