library(ale)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#>     filter, lag
#> The following objects are masked from 'package:base':
#>
#>     intersect, setdiff, setequal, union
library(ggplot2)
This vignette demonstrates how `ale` works for various datatypes of input (x) values. You should first read the introductory vignette, which explains the general functionality of the package; this vignette is a demonstration of specific functionality.
`mtcars` dataset (Motor Trend Car Road Tests)

For this demonstration, we will modify the built-in `mtcars` dataset so that it has binary (logical), multinomial (factor, that is, non-ordered categories), ordinal (ordered factor), discrete interval (integer), and continuous interval (numeric or double) values. This will let us test all the different basic variations of x variables.
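As a quick aside (not part of the vignette's own pipeline), these statistical datatypes map onto base R types as follows; the object names below are arbitrary illustrations:

# Illustrative mapping of the five statistical datatypes to R types
binary_x      <- as.logical(c(0, 1, 1))              # logical (binary)
multinomial_x <- factor(c("USA", "Japan", "Italy"))  # factor (non-ordered categories)
ordinal_x     <- ordered(c(3, 4, 5))                 # ordered factor (ordinal)
discrete_x    <- as.integer(c(4, 6, 8))              # integer (discrete interval)
continuous_x  <- c(21.0, 22.8, 18.7)                 # double (continuous interval)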
The data was extracted from the 1974 Motor Trend US magazine, and comprises fuel consumption and 10 aspects of automobile design and performance for 32 automobiles (1973–74 models).
It is a data frame with 32 observations on 11 (numeric) variables:
| Column | Variable | Description |
|---|---|---|
| [, 1] | mpg | Miles/(US) gallon |
| [, 2] | cyl | Number of cylinders |
| [, 3] | disp | Displacement (cu.in.) |
| [, 4] | hp | Gross horsepower |
| [, 5] | drat | Rear axle ratio |
| [, 6] | wt | Weight (1000 lbs) |
| [, 7] | qsec | 1/4 mile time |
| [, 8] | vs | Engine (0 = V-shaped, 1 = straight) |
| [, 9] | am | Transmission (0 = automatic, 1 = manual) |
| [,10] | gear | Number of forward gears |
| [,11] | carb | Number of carburetors |
Henderson and Velleman (1981) comment in a footnote to Table 1: ‘Hocking [original transcriber]’s noncrucial coding of the Mazda’s rotary engine as a straight six-cylinder engine and the Porsche’s flat engine as a V engine, as well as the inclusion of the diesel Mercedes 240D, have been retained to enable direct comparisons to be made with previous analyses.’
Henderson and Velleman (1981), Building multiple regression models interactively. Biometrics, 37, 391–411.
Modified `cars` dataset
To get the multinomial variable, we will adapt information from the names of the cars, which is available in the row names of the `mtcars` cases. However, since each name is unique, we will instead determine the country of each car's manufacturer; this gives us a factor with just six categories. With such a small dataset of only 32 rows, we cannot work with many more categories than that, or else the models would end up with too many dummy variables for so little data.
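To see why the category count matters: a factor with k levels contributes k - 1 dummy variables to the model's design matrix. A rough check (hypothetical code, using the six countries we are about to assign):

# A 6-level factor expands to 5 dummy columns plus the intercept,
# which is already substantial for a 32-row dataset
country_demo <- factor(c("USA", "Japan", "Italy", "UK", "Sweden", "Germany"))
ncol(model.matrix(~ country_demo))  # 6 columns: intercept + 5 dummies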
Our adapted dataset will be called `cars`; it is the same data as `mtcars` but with several variables encoded as more diverse yet appropriate datatypes.
# Create a function to determine the country of origin of a car based on its make
car_country <- function(make) {
  american_makes <- c("AMC", "Cadillac", "Camaro", "Chrysler", "Dodge", "Duster",
                      "Ford", "Hornet", "Lincoln", "Pontiac", "Valiant")
  japanese_makes <- c("Datsun", "Honda", "Mazda", "Toyota")
  italian_makes  <- c("Ferrari", "Fiat", "Maserati")
  british_makes  <- c("Lotus")
  swedish_makes  <- c("Volvo")
  german_makes   <- c("Merc", "Porsche")

  case_when(
    make %in% american_makes ~ 'USA',
    make %in% japanese_makes ~ 'Japan',
    make %in% italian_makes  ~ 'Italy',
    make %in% british_makes  ~ 'UK',
    make %in% swedish_makes  ~ 'Sweden',
    make %in% german_makes   ~ 'Germany'
  )
}
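As a quick sanity check (not in the original vignette code), the lookup is vectorized, so it classifies several makes at once:

# Expected result: "Japan", "Germany", "Italy"
car_country(c("Toyota", "Merc", "Ferrari"))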
cars <-
  mtcars |>
  as_tibble(rownames = 'make') |>
  # retain only first word as the make without the car model
  mutate(
    make = stringr::str_extract(make, "^\\S+") |> factor(),
    country = car_country(make) |> factor()
  ) |>
  select(-make) |>
  mutate(across(c(vs, am), as.logical)) |>
  mutate(gear = as.ordered(gear)) |>
  mutate(across(c(cyl, carb), as.integer))

cars |>
  print(n = 50)
#> # A tibble: 32 × 12
#> mpg cyl disp hp drat wt qsec vs am gear carb country
#> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <lgl> <ord> <int> <fct>
#> 1 21 6 160 110 3.9 2.62 16.5 FALSE TRUE 4 4 Japan
#> 2 21 6 160 110 3.9 2.88 17.0 FALSE TRUE 4 4 Japan
#> 3 22.8 4 108 93 3.85 2.32 18.6 TRUE TRUE 4 1 Japan
#> 4 21.4 6 258 110 3.08 3.22 19.4 TRUE FALSE 3 1 USA
#> 5 18.7 8 360 175 3.15 3.44 17.0 FALSE FALSE 3 2 USA
#> 6 18.1 6 225 105 2.76 3.46 20.2 TRUE FALSE 3 1 USA
#> 7 14.3 8 360 245 3.21 3.57 15.8 FALSE FALSE 3 4 USA
#> 8 24.4 4 147. 62 3.69 3.19 20 TRUE FALSE 4 2 Germany
#> 9 22.8 4 141. 95 3.92 3.15 22.9 TRUE FALSE 4 2 Germany
#> 10 19.2 6 168. 123 3.92 3.44 18.3 TRUE FALSE 4 4 Germany
#> 11 17.8 6 168. 123 3.92 3.44 18.9 TRUE FALSE 4 4 Germany
#> 12 16.4 8 276. 180 3.07 4.07 17.4 FALSE FALSE 3 3 Germany
#> 13 17.3 8 276. 180 3.07 3.73 17.6 FALSE FALSE 3 3 Germany
#> 14 15.2 8 276. 180 3.07 3.78 18 FALSE FALSE 3 3 Germany
#> 15 10.4 8 472 205 2.93 5.25 18.0 FALSE FALSE 3 4 USA
#> 16 10.4 8 460 215 3 5.42 17.8 FALSE FALSE 3 4 USA
#> 17 14.7 8 440 230 3.23 5.34 17.4 FALSE FALSE 3 4 USA
#> 18 32.4 4 78.7 66 4.08 2.2 19.5 TRUE TRUE 4 1 Italy
#> 19 30.4 4 75.7 52 4.93 1.62 18.5 TRUE TRUE 4 2 Japan
#> 20 33.9 4 71.1 65 4.22 1.84 19.9 TRUE TRUE 4 1 Japan
#> 21 21.5 4 120. 97 3.7 2.46 20.0 TRUE FALSE 3 1 Japan
#> 22 15.5 8 318 150 2.76 3.52 16.9 FALSE FALSE 3 2 USA
#> 23 15.2 8 304 150 3.15 3.44 17.3 FALSE FALSE 3 2 USA
#> 24 13.3 8 350 245 3.73 3.84 15.4 FALSE FALSE 3 4 USA
#> 25 19.2 8 400 175 3.08 3.84 17.0 FALSE FALSE 3 2 USA
#> 26 27.3 4 79 66 4.08 1.94 18.9 TRUE TRUE 4 1 Italy
#> 27 26 4 120. 91 4.43 2.14 16.7 FALSE TRUE 5 2 Germany
#> 28 30.4 4 95.1 113 3.77 1.51 16.9 TRUE TRUE 5 2 UK
#> 29 15.8 8 351 264 4.22 3.17 14.5 FALSE TRUE 5 4 USA
#> 30 19.7 6 145 175 3.62 2.77 15.5 FALSE TRUE 5 6 Italy
#> 31 15 8 301 335 3.54 3.57 14.6 FALSE TRUE 5 8 Italy
#> 32 21.4 4 121 109 4.11 2.78 18.6 TRUE TRUE 4 2 Sweden
summary(cars)
#> mpg cyl disp hp
#> Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
#> 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
#> Median :19.20 Median :6.000 Median :196.3 Median :123.0
#> Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
#> 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
#> Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
#> drat wt qsec vs
#> Min. :2.760 Min. :1.513 Min. :14.50 Mode :logical
#> 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 FALSE:18
#> Median :3.695 Median :3.325 Median :17.71 TRUE :14
#> Mean :3.597 Mean :3.217 Mean :17.85
#> 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90
#> Max. :4.930 Max. :5.424 Max. :22.90
#> am gear carb country
#> Mode :logical 3:15 Min. :1.000 Germany: 8
#> FALSE:19 4:12 1st Qu.:2.000 Italy : 4
#> TRUE :13 5: 5 Median :2.000 Japan : 6
#> Mean :2.812 Sweden : 1
#> 3rd Qu.:4.000 UK : 1
#> Max. :8.000 USA :12
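At this point we can confirm (an optional check, not shown in the original vignette) that the columns span the intended datatypes:

# List each column's class: double, integer, logical, ordered factor, and factor
vapply(cars, \(x) paste(class(x), collapse = ", "), character(1))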
With GAM, only numeric variables can be smoothed, not binary or categorical ones. However, smoothing does not always improve the model, since some variables are not related to the outcome and some that are related actually have a simple linear relationship. To keep this demonstration simple, we did some earlier analysis (not shown here) to determine where smoothing is worthwhile on the modified `cars` dataset, so only some of the numeric variables are smoothed. Our goal here is not to demonstrate the best modelling procedure but rather to demonstrate the flexibility of the `ale` package.
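For instance, one simple screen for whether smoothing a term is worthwhile (a hypothetical sketch, not the analysis actually used here) is to compare a linear and a smoothed single-predictor fit by AIC:

# Lower AIC suggests a better trade-off between fit and complexity;
# this is only an illustration on one predictor, not a full selection procedure
qsec_linear <- mgcv::gam(mpg ~ qsec, data = cars)
qsec_smooth <- mgcv::gam(mpg ~ s(qsec), data = cars)
AIC(qsec_linear, qsec_smooth)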
cm <- mgcv::gam(mpg ~ cyl + s(disp) + s(hp) + drat + wt + s(qsec) +
                  + vs + am + gear + carb + country,
                data = cars)
summary(cm)
#>
#> Family: gaussian
#> Link function: identity
#>
#> Formula:
#> mpg ~ cyl + s(disp) + s(hp) + drat + wt + s(qsec) + +vs + am +
#> gear + carb + country
#>
#> Parametric coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -7.962 13.040 -0.611 0.5698
#> cyl 3.266 1.674 1.951 0.1121
#> drat 4.315 1.562 2.762 0.0425 *
#> wt -2.546 1.382 -1.843 0.1284
#> vsTRUE 13.613 2.342 5.812 0.0026 **
#> amTRUE 5.402 2.910 1.856 0.1262
#> gear.L 7.100 4.520 1.571 0.1806
#> gear.Q 1.966 1.139 1.726 0.1486
#> carb -1.424 1.014 -1.404 0.2228
#> countryItaly 2.671 2.944 0.907 0.4084
#> countryJapan 1.749 3.103 0.564 0.5987
#> countrySweden 2.148 3.998 0.537 0.6155
#> countryUK -5.307 3.194 -1.661 0.1612
#> countryUSA -6.727 2.232 -3.014 0.0320 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Approximate significance of smooth terms:
#> edf Ref.df F p-value
#> s(disp) 1.771 2.055 3.090 0.13243
#> s(hp) 3.143 3.505 1.083 0.35086
#> s(qsec) 8.380 8.791 14.187 0.00503 **
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> R-sq.(adj) = 0.986 Deviance explained = 99.8%
#> GCV = 3.5064 Scale est. = 0.51562 n = 32
Now we generate ALE data from the `cars` GAM model and plot it.
cars_ale <- ale(cars, cm)
# Print all plots
# Skip .common_data when iterating through the data for plotting
cars_ale[setdiff(names(cars_ale), '.common_data')] |>
  purrr::map(\(.x) .x$plot) |>  # extract plots as a list
  gridExtra::grid.arrange(grobs = _, ncol = 2)
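To examine a single variable rather than the full grid, its plot can be pulled directly; this assumes, as the names() call above implies, that each element of the ale() output is named after its x column and contains a plot element:

# ALE plot for the country variable alone (element name assumed from the loop above)
cars_ale$country$plot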
We can see that `ale` has no trouble modelling any of the datatypes in our sample (logical, factor, ordered, integer, or double).
We can also generate and plot the ALE data for all two-way interactions.
cars_ale_ixn <- ale_ixn(cars, cm)
# Skip .common_data when iterating through the data for plotting
cars_ale_ixn[setdiff(names(cars_ale_ixn), '.common_data')] |>
  purrr::walk(\(x1) {  # extract list of x1 ALE outputs
    purrr::map(x1, \(.x) .x$plot) |>  # for each x1, extract list of x2 ALE outputs
      gridExtra::grid.arrange(grobs = _, ncol = 2)  # plot all x1 plots
  })
There are no interactions in this dataset.
Finally, as explained in the vignette on modelling with small datasets, a more appropriate modelling workflow would require bootstrapping the entire model, not just the ALE data. So, let’s do that now.
mb <- model_bootstrap(
  cars,
  'mgcv::gam(mpg ~ cyl + s(disp) + s(hp) + drat + wt + s(qsec) +
     + vs + am + gear + carb + country)'
)
# Skip .common_data when iterating through the data for plotting
mb$ale_data[setdiff(names(mb$ale_data), '.common_data')] |>
  purrr::map(\(.x) .x$plot) |>  # extract plots as a list
  gridExtra::grid.arrange(grobs = _, ncol = 2)