Supervised Learning with tidylearn

library(tidylearn)
library(dplyr)

Introduction

This vignette demonstrates supervised learning capabilities in tidylearn. All methods shown here wrap established R packages - the algorithms are unchanged, tidylearn simply provides a consistent interface and tidy output.

Each method delegates to a well-established R implementation; access the raw model object via model$fit for package-specific functionality.
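
For example, after fitting a model you can pass model$fit straight to functions from the underlying package. A minimal sketch, assuming the "linear" method wraps a standard lm-style fit:

# Fit with the tidylearn interface, then drop down to the wrapped object
m <- tl_model(mtcars, mpg ~ wt + hp, method = "linear")
class(m$fit)    # class of the underlying fit, e.g. "lm" (assumption)
summary(m$fit)  # package-specific summary of the raw model object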

Classification

Binary Classification

Let’s create a binary classification problem from the iris dataset:

# Create binary classification dataset
iris_binary <- iris %>%
  filter(Species %in% c("setosa", "versicolor")) %>%
  mutate(Species = droplevels(Species))

# Split data
split <- tl_split(iris_binary, prop = 0.7, stratify = "Species", seed = 123)
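
To confirm the stratified split preserved the class balance, tabulate the response in each piece:

# Class counts in the training and test sets (should be roughly proportional)
table(split$train$Species)
table(split$test$Species)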

Logistic Regression

# Train logistic regression
model_logistic <- tl_model(split$train, Species ~ ., method = "logistic")
#> Warning: glm.fit: algorithm did not converge
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(model_logistic)
#> tidylearn Model
#> ===============
#> Paradigm: supervised 
#> Method: logistic 
#> Task: Classification 
#> Formula: Species ~ . 
#> 
#> Training observations: 70
# Predictions
preds_logistic <- predict(model_logistic, new_data = split$test)
head(preds_logistic)
#> # A tibble: 6 × 1
#>      .pred
#>      <dbl>
#> 1 2.22e-16
#> 2 2.22e-16
#> 3 2.22e-16
#> 4 2.22e-16
#> 5 2.22e-16
#> 6 2.22e-16

The convergence warnings and the near-zero predicted probabilities are expected here: setosa and versicolor are perfectly separable (most clearly by petal size), so the logistic fit pushes the fitted probabilities to essentially 0 or 1.

Decision Trees

# Train decision tree
model_tree <- tl_model(split$train, Species ~ ., method = "tree")
print(model_tree)
#> tidylearn Model
#> ===============
#> Paradigm: supervised 
#> Method: tree 
#> Task: Classification 
#> Formula: Species ~ . 
#> 
#> Training observations: 70

# Predictions
preds_tree <- predict(model_tree, new_data = split$test)

Multi-class Classification

# Split full iris dataset
split_multi <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123)

Random Forest

# Train random forest
model_forest <- tl_model(split_multi$train, Species ~ ., method = "forest")
print(model_forest)
#> tidylearn Model
#> ===============
#> Paradigm: supervised 
#> Method: forest 
#> Task: Classification 
#> Formula: Species ~ . 
#> 
#> Training observations: 105
# Predictions
preds_forest <- predict(model_forest, new_data = split_multi$test)
head(preds_forest)
#> # A tibble: 6 × 1
#>   .pred 
#>   <fct> 
#> 1 setosa
#> 2 setosa
#> 3 setosa
#> 4 setosa
#> 5 setosa
#> 6 setosa
# Accuracy on test set
mean(preds_forest$.pred == split_multi$test$Species)
#> [1] 0.9333333
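
Beyond overall accuracy, a confusion matrix shows which species are confused with which:

# Confusion matrix: predicted classes against the true test labels
table(predicted = preds_forest$.pred, actual = split_multi$test$Species)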

Support Vector Machines

# Train SVM
model_svm <- tl_model(split_multi$train, Species ~ ., method = "svm")
print(model_svm)

# Predictions
preds_svm <- predict(model_svm, new_data = split_multi$test)
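
Assuming the SVM method also returns class labels in the .pred column (as the forest method does above), test accuracy can be computed the same way:

# Accuracy on the test set
mean(preds_svm$.pred == split_multi$test$Species)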

Regression

Linear Regression

# Split mtcars data
split_reg <- tl_split(mtcars, prop = 0.7, seed = 123)

# Train linear model
model_lm <- tl_model(split_reg$train, mpg ~ wt + hp + disp, method = "linear")
print(model_lm)
#> tidylearn Model
#> ===============
#> Paradigm: supervised 
#> Method: linear 
#> Task: Regression 
#> Formula: mpg ~ wt + hp + disp 
#> 
#> Training observations: 22
# Predictions
preds_lm <- predict(model_lm, new_data = split_reg$test)
head(preds_lm)
#> # A tibble: 6 × 1
#>   .pred
#>   <dbl>
#> 1  24.0
#> 2  23.1
#> 3  21.2
#> 4  20.7
#> 5  16.0
#> 6  17.2
# Calculate RMSE
rmse <- sqrt(mean((preds_lm$.pred - split_reg$test$mpg)^2))
cat("RMSE:", round(rmse, 2), "\n")
#> RMSE: 2.16

Polynomial Regression

# Polynomial regression for non-linear relationships
model_poly <- tl_model(split_reg$train, mpg ~ wt, method = "polynomial", degree = 2)
print(model_poly)
#> tidylearn Model
#> ===============
#> Paradigm: supervised 
#> Method: polynomial 
#> Task: Regression 
#> Formula: mpg ~ wt 
#> 
#> Training observations: 22
# Predictions
preds_poly <- predict(model_poly, new_data = split_reg$test)

# RMSE
rmse_poly <- sqrt(mean((preds_poly$.pred - split_reg$test$mpg)^2))
cat("Polynomial RMSE:", round(rmse_poly, 2), "\n")
#> Polynomial RMSE: 2.09

Random Forest Regression

# Train random forest for regression
model_rf_reg <- tl_model(split_reg$train, mpg ~ ., method = "forest")
print(model_rf_reg)
#> tidylearn Model
#> ===============
#> Paradigm: supervised 
#> Method: forest 
#> Task: Regression 
#> Formula: mpg ~ . 
#> 
#> Training observations: 22
# Predictions
preds_rf <- predict(model_rf_reg, new_data = split_reg$test)

# RMSE
rmse_rf <- sqrt(mean((preds_rf$.pred - split_reg$test$mpg)^2))
cat("Random Forest RMSE:", round(rmse_rf, 2), "\n")
#> Random Forest RMSE: 1.97

Regularized Regression

Regularization helps prevent overfitting by penalizing large coefficients: ridge applies an L2 penalty that shrinks coefficients toward zero, LASSO applies an L1 penalty that can shrink some coefficients exactly to zero (feature selection), and elastic net mixes the two.

Ridge Regression

# Ridge regression (L2 regularization)
model_ridge <- tl_model(split_reg$train, mpg ~ ., method = "ridge")
print(model_ridge)

# Predictions
preds_ridge <- predict(model_ridge, new_data = split_reg$test)

LASSO

# LASSO (L1 regularization) - performs feature selection
model_lasso <- tl_model(split_reg$train, mpg ~ ., method = "lasso")
print(model_lasso)

# Predictions
preds_lasso <- predict(model_lasso, new_data = split_reg$test)

Elastic Net

# Elastic Net - combines L1 and L2 regularization
model_enet <- tl_model(split_reg$train, mpg ~ ., method = "elastic_net", alpha = 0.5)
print(model_enet)

# Predictions
preds_enet <- predict(model_enet, new_data = split_reg$test)
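
The three regularized fits can be compared on the held-out data in the same way as the earlier models (values will depend on the penalty each method selects):

# Held-out RMSE for each regularized model
sapply(
  list(ridge = preds_ridge, lasso = preds_lasso, elastic_net = preds_enet),
  function(p) sqrt(mean((p$.pred - split_reg$test$mpg)^2))
)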

Model Comparison

# Compare multiple models
models <- list(
  linear = tl_model(split_reg$train, mpg ~ ., method = "linear"),
  tree = tl_model(split_reg$train, mpg ~ ., method = "tree"),
  forest = tl_model(split_reg$train, mpg ~ ., method = "forest")
)
# Calculate RMSE for each model
results <- data.frame(
  Model = character(),
  RMSE = numeric(),
  stringsAsFactors = FALSE
)

for (model_name in names(models)) {
  preds <- predict(models[[model_name]], new_data = split_reg$test)
  rmse <- sqrt(mean((preds$.pred - split_reg$test$mpg)^2))

  results <- rbind(results, data.frame(
    Model = model_name,
    RMSE = rmse
  ))
}

results <- results %>% arrange(RMSE)
print(results)
#>    Model     RMSE
#> 1 forest 2.046967
#> 2 linear 2.281450
#> 3   tree 4.095888

Advanced Features

Using Preprocessed Data

# Preprocess data
processed <- tl_prepare_data(
  split_reg$train,
  mpg ~ .,
  scale_method = "standardize",
  remove_correlated = TRUE,
  correlation_cutoff = 0.9
)
#> Removing 1 highly correlated features
#> Scaling numeric features using method: standardize
# Train on preprocessed data
model_processed <- tl_model(processed$data, mpg ~ ., method = "linear")
print(model_processed)
#> tidylearn Model
#> ===============
#> Paradigm: supervised 
#> Method: linear 
#> Task: Regression 
#> Formula: mpg ~ . 
#> 
#> Training observations: 22

A model trained on preprocessed data expects new observations to be transformed in the same way, using the training-set parameters, before calling predict(); see the note at the end of this vignette.

Formula Variations

# Interaction terms
model_interact <- tl_model(split_reg$train, mpg ~ wt * hp, method = "linear")

# Polynomial terms using I()
model_poly_manual <- tl_model(split_reg$train, mpg ~ wt + I(wt^2), method = "linear")

# Subset of predictors
model_subset <- tl_model(split_reg$train, mpg ~ wt + hp + disp, method = "linear")

Handling Different Data Types

Categorical Predictors

# Create dataset with categorical variables
mtcars_cat <- mtcars %>%
  mutate(
    cyl = as.factor(cyl),
    gear = as.factor(gear),
    am = as.factor(am)
  )

split_cat <- tl_split(mtcars_cat, prop = 0.7, seed = 123)

# Model with categorical predictors
model_cat <- tl_model(split_cat$train, mpg ~ ., method = "forest")
print(model_cat)
#> tidylearn Model
#> ===============
#> Paradigm: supervised 
#> Method: forest 
#> Task: Regression 
#> Formula: mpg ~ . 
#> 
#> Training observations: 22

Missing Values

# Create data with missing values
mtcars_missing <- mtcars
mtcars_missing[sample(1:nrow(mtcars_missing), 5), "hp"] <- NA
mtcars_missing[sample(1:nrow(mtcars_missing), 3), "wt"] <- NA

# Preprocess to handle missing values
processed_missing <- tl_prepare_data(
  mtcars_missing,
  mpg ~ .,
  impute_method = "mean",
  scale_method = "standardize"
)
#> Imputing missing values using method: mean
#> Scaling numeric features using method: standardize

# Train model
model_imputed <- tl_model(processed_missing$data, mpg ~ ., method = "linear")

Best Practices

  1. Always split your data before training so performance is measured on unseen observations
  2. Use stratified splitting for classification to maintain class proportions
  3. Preprocess training and test data consistently - apply the training-set parameters (means, scales, imputed values) to the test set
  4. Compare multiple models to find the best approach
  5. Consider regularization when dealing with many predictors
  6. Use appropriate metrics - accuracy for classification, RMSE/MAE for regression (see the sketch below)
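
As an illustration of the last point, a small helper for the regression metrics used in this vignette, applied to the earlier linear model (plain R, no tidylearn-specific API assumed):

# RMSE and MAE from a vector of truths and a vector of predictions
reg_metrics <- function(truth, estimate) {
  c(
    rmse = sqrt(mean((estimate - truth)^2)),
    mae  = mean(abs(estimate - truth))
  )
}

reg_metrics(split_reg$test$mpg, preds_lm$.pred)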

Summary

tidylearn provides a unified interface for supervised learning:

# Complete workflow example
final_split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 42)
final_prep <- tl_prepare_data(final_split$train, Species ~ ., scale_method = "standardize")
#> Scaling numeric features using method: standardize
final_model <- tl_model(final_prep$data, Species ~ ., method = "forest")
final_preds <- predict(final_model, new_data = final_split$test)

# Evaluate
accuracy <- mean(final_preds$.pred == final_split$test$Species)
cat("Test Accuracy:", round(accuracy * 100, 1), "%\n")
#> Test Accuracy: 33.3 %
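
The low accuracy above (chance level for three balanced classes) comes from a preprocessing mismatch: the model was trained on standardized features, but the raw test set was passed to predict(). Below is a minimal sketch of applying the training-set scaling to the test set by hand, assuming scale_method = "standardize" centers and scales each numeric column by its training mean and standard deviation (tl_prepare_data may expose these parameters directly; check its return value):

# Standardize the test set with the training-set means and standard deviations
num_cols    <- names(final_split$train)[sapply(final_split$train, is.numeric)]
train_means <- sapply(final_split$train[num_cols], mean)
train_sds   <- sapply(final_split$train[num_cols], sd)

test_scaled <- final_split$test
test_scaled[num_cols] <- Map(
  function(x, m, s) (x - m) / s,
  test_scaled[num_cols], train_means, train_sds
)

# Predict on the consistently preprocessed test set and re-check accuracy
final_preds_scaled <- predict(final_model, new_data = test_scaled)
mean(final_preds_scaled$.pred == final_split$test$Species)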
