Unsupervised Learning with tidylearn

library(tidylearn)
library(dplyr)
library(ggplot2)

Introduction

This vignette explores unsupervised learning in tidylearn. All methods wrap established R packages; the algorithms themselves are unchanged, and tidylearn simply provides a consistent interface and tidy output.

The wrapped methods cover dimensionality reduction (PCA, MDS) and clustering (k-means, PAM, CLARA, hierarchical clustering, DBSCAN).

Access the underlying model object via model$fit$model (as in the hierarchical clustering and elbow-method examples below) when you need package-specific functionality.

Dimensionality Reduction

Dimensionality reduction techniques help visualize high-dimensional data and extract key patterns.

Principal Component Analysis (PCA)

# Perform PCA on iris data (excluding species)
model_pca <- tl_model(iris[, 1:4], method = "pca")
print(model_pca)
#> tidylearn Model
#> ===============
#> Paradigm: unsupervised 
#> Method: pca 
#> Technique: pca 
#> 
#> Training observations: 150
# Extract variance explained
variance_explained <- model_pca$fit$variance_explained
print(variance_explained)
#> # A tibble: 4 × 5
#>   component  sdev variance prop_variance cum_variance
#>   <chr>     <dbl>    <dbl>         <dbl>        <dbl>
#> 1 PC1       1.71    2.92         0.730          0.730
#> 2 PC2       0.956   0.914        0.229          0.958
#> 3 PC3       0.383   0.147        0.0367         0.995
#> 4 PC4       0.144   0.0207       0.00518        1
# Cumulative variance explained
cumsum(variance_explained$prop_variance)
#> [1] 0.7296245 0.9581321 0.9948213 1.0000000
# Transform data to principal components
pca_scores <- predict(model_pca)
head(pca_scores)
#> # A tibble: 6 × 5
#>   .obs_id   PC1    PC2     PC3      PC4
#>   <chr>   <dbl>  <dbl>   <dbl>    <dbl>
#> 1 1       -2.26 -0.478  0.127   0.0241 
#> 2 2       -2.07  0.672  0.234   0.103  
#> 3 3       -2.36  0.341 -0.0441  0.0283 
#> 4 4       -2.29  0.595 -0.0910 -0.0657 
#> 5 5       -2.38 -0.645 -0.0157 -0.0358 
#> 6 6       -2.07 -1.48  -0.0269  0.00659
# Visualize first two components
pca_plot_data <- pca_scores %>%
  mutate(Species = iris$Species)

ggplot(pca_plot_data, aes(x = PC1, y = PC2, color = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "PCA of Iris Dataset",
    x = paste0("PC1 (", round(variance_explained$prop_variance[1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(variance_explained$prop_variance[2] * 100, 1), "%)")
  ) +
  theme_minimal()

# Examine loadings (variable contributions)
loadings <- model_pca$fit$loadings
print(loadings)
#> # A tibble: 4 × 5
#>   variable        PC1     PC2    PC3    PC4
#>   <chr>         <dbl>   <dbl>  <dbl>  <dbl>
#> 1 Sepal.Length  0.521 -0.377   0.720  0.261
#> 2 Sepal.Width  -0.269 -0.923  -0.244 -0.124
#> 3 Petal.Length  0.580 -0.0245 -0.142 -0.801
#> 4 Petal.Width   0.565 -0.0669 -0.634  0.524
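
As a quick visual check, the loadings can be drawn as vectors in the PC1/PC2 plane. This is a minimal ggplot2 sketch built from the loadings tibble above (it is not a tidylearn helper):

# Plot each variable's loading as a vector in the PC1/PC2 plane
ggplot(loadings) +
  geom_segment(aes(x = 0, y = 0, xend = PC1, yend = PC2)) +
  geom_text(aes(x = PC1, y = PC2, label = variable), vjust = -0.5, size = 3) +
  labs(title = "PCA Loadings: PC1 vs PC2", x = "PC1", y = "PC2") +
  theme_minimal()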

Multidimensional Scaling (MDS)

# Perform MDS
model_mds <- tl_model(iris[, 1:4], method = "mds", k = 2)
print(model_mds)
#> tidylearn Model
#> ===============
#> Paradigm: unsupervised 
#> Method: mds 
#> Technique: mds 
#> 
#> Training observations: 150
# Extract MDS coordinates
mds_points <- predict(model_mds)
head(mds_points)
#> # A tibble: 6 × 2
#>    Dim1   Dim2
#>   <dbl>  <dbl>
#> 1 -2.68  0.319
#> 2 -2.71 -0.177
#> 3 -2.89 -0.145
#> 4 -2.75 -0.318
#> 5 -2.73  0.327
#> 6 -2.28  0.741
# Visualize MDS
mds_plot_data <- mds_points %>%
  mutate(Species = iris$Species)

ggplot(mds_plot_data, aes(x = Dim1, y = Dim2, color = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "MDS of Iris Dataset") +
  theme_minimal()
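
With Euclidean distances, classical MDS recovers the same configuration as PCA on the centred data. Assuming tidylearn uses classical scaling (stats::cmdscale) here, the coordinates can be reproduced in base R, up to sign flips of the axes:

# Classical MDS on Euclidean distances with base R
d <- dist(iris[, 1:4])
head(cmdscale(d, k = 2))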

Clustering

Clustering algorithms group similar observations together without using labels.

K-means Clustering

# Perform k-means with k=3
model_kmeans <- tl_model(iris[, 1:4], method = "kmeans", k = 3)
print(model_kmeans)
#> tidylearn Model
#> ===============
#> Paradigm: unsupervised 
#> Method: kmeans 
#> Technique: kmeans 
#> 
#> Training observations: 150
# Extract cluster assignments
clusters <- model_kmeans$fit$clusters
head(clusters)
#> # A tibble: 6 × 2
#>   .obs_id cluster
#>   <chr>     <int>
#> 1 1             3
#> 2 2             3
#> 3 3             3
#> 4 4             3
#> 5 5             3
#> 6 6             3
# Compare clusters with actual species
table(Cluster = clusters$cluster, Species = iris$Species)
#>        Species
#> Cluster setosa versicolor virginica
#>       1      0          2        36
#>       2      0         48        14
#>       3     50          0         0
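
The cross-tabulation can be condensed into a single agreement number such as cluster purity, the share of observations that fall into their cluster's majority species. A base-R sketch:

# Cluster purity: proportion of points matching their cluster's majority species
conf_mat <- table(clusters$cluster, iris$Species)
sum(apply(conf_mat, 1, max)) / nrow(iris)
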
# Visualize clusters using PCA
cluster_viz <- pca_scores %>%
  mutate(
    Cluster = as.factor(clusters$cluster),
    Species = iris$Species
  )

ggplot(cluster_viz, aes(x = PC1, y = PC2, color = Cluster, shape = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "K-means Clusters vs True Species") +
  theme_minimal()

# Access cluster centers
centers <- model_kmeans$fit$centers
print(centers)
#> # A tibble: 3 × 5
#>   cluster Sepal.Length Sepal.Width Petal.Length Petal.Width
#>     <int>        <dbl>       <dbl>        <dbl>       <dbl>
#> 1       1         6.85        3.07         5.74       2.07 
#> 2       2         5.90        2.75         4.39       1.43 
#> 3       3         5.01        3.43         1.46       0.246

PAM (K-medoids)

PAM (Partitioning Around Medoids) is more robust to outliers than k-means because it uses actual observations (medoids) rather than means as cluster centres:

# Perform PAM clustering
model_pam <- tl_model(iris[, 1:4], method = "pam", k = 3)
print(model_pam)

# Extract clusters
clusters_pam <- model_pam$fit$clusters
table(Cluster = clusters_pam$cluster, Species = iris$Species)
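
PAM also reports an average silhouette width, which is useful when comparing clusterings. Assuming the raw cluster::pam object is stored at model_pam$fit$model (the same slot used for the raw fits in the examples below), it can be read directly:

# Average silhouette width from the underlying pam object (assumed slot)
model_pam$fit$model$silinfo$avg.width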

Hierarchical Clustering

# Perform hierarchical clustering
model_hclust <- tl_model(iris[, 1:4], method = "hclust")
print(model_hclust)
#> tidylearn Model
#> ===============
#> Paradigm: unsupervised 
#> Method: hclust 
#> Technique: hclust 
#> 
#> Training observations: 150
# Plot dendrogram
plot(model_hclust$fit$model, labels = FALSE, main = "Hierarchical Clustering of Iris")

# Cut tree to get clusters
k <- 3
clusters_hc <- cutree(model_hclust$fit$model, k = k)
table(Cluster = clusters_hc, Species = iris$Species)
#>        Species
#> Cluster setosa versicolor virginica
#>       1     50          0         0
#>       2      0         50        14
#>       3      0          0        36
# Visualize hierarchical clusters
hc_viz <- pca_scores %>%
  mutate(
    Cluster = as.factor(clusters_hc),
    Species = iris$Species
  )

ggplot(hc_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Hierarchical Clustering Results") +
  theme_minimal()

DBSCAN (Density-Based Clustering)

DBSCAN groups points that lie in dense regions, so it can find arbitrarily shaped clusters and flags low-density points as noise (cluster 0):

# Perform DBSCAN
model_dbscan <- tl_model(iris[, 1:4], method = "dbscan", eps = 0.5, minPts = 5)
print(model_dbscan)

# Extract clusters (0 = noise/outliers)
clusters_dbscan <- model_dbscan$fit$clusters
table(clusters_dbscan$cluster)

# Compare with species
table(Cluster = clusters_dbscan$cluster, Species = iris$Species)
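
The eps parameter is the most sensitive setting. A common heuristic is the k-nearest-neighbour distance plot: sort each point's distance to its k-th neighbour and place eps near the "knee" of the curve. Assuming the dbscan package (which tidylearn presumably wraps here) is installed:

# k-nearest-neighbour distance plot for choosing eps
dbscan::kNNdistplot(iris[, 1:4], k = 5)
abline(h = 0.5, lty = 2)  # the eps value used above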

CLARA (for Large Datasets)

CLARA (Clustering Large Applications) scales PAM to large datasets by running it on repeated random subsamples:

# Create larger dataset
large_data <- iris[rep(1:nrow(iris), 10), 1:4]

# Perform CLARA
model_clara <- tl_model(large_data, method = "clara", k = 3, samples = 5)
print(model_clara)

# Extract clusters
clusters_clara <- model_clara$fit$clusters

Choosing the Number of Clusters

Elbow Method

# Try different values of k
k_values <- 2:8
within_ss <- numeric(length(k_values))

for (i in seq_along(k_values)) {
  k <- k_values[i]
  model <- tl_model(iris[, 1:4], method = "kmeans", k = k)
  within_ss[i] <- model$fit$model$tot.withinss
}

# Plot elbow curve
elbow_data <- data.frame(k = k_values, within_ss = within_ss)

ggplot(elbow_data, aes(x = k, y = within_ss)) +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  labs(
    title = "Elbow Method for Optimal k",
    x = "Number of Clusters (k)",
    y = "Total Within-Cluster Sum of Squares"
  ) +
  theme_minimal()
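
Silhouette analysis complements the elbow plot: the average silhouette width measures how well separated the clusters are, and the k that maximises it is a reasonable default. A sketch using cluster::silhouette() (the cluster package ships with R as a recommended package):

library(cluster)

# Average silhouette width for each candidate k
d <- dist(iris[, 1:4])
avg_sil <- sapply(k_values, function(k) {
  model <- tl_model(iris[, 1:4], method = "kmeans", k = k)
  sil <- silhouette(model$fit$clusters$cluster, d)
  mean(sil[, "sil_width"])
})

# k with the highest average silhouette width
k_values[which.max(avg_sil)]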

Predicting on New Data

Clustering New Observations

# Train clustering model
model_train <- tl_model(iris[1:100, 1:4], method = "kmeans", k = 3)

# Predict cluster assignments for new data
new_data <- iris[101:150, 1:4]
new_clusters <- predict(model_train, new_data = new_data)

head(new_clusters)
#> # A tibble: 6 × 1
#>   cluster
#>     <int>
#> 1       3
#> 2       3
#> 3       3
#> 4       3
#> 5       3
#> 6       3
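
For k-means, predict() presumably assigns each new observation to the nearest training centre in Euclidean distance. A manual sketch of that logic, using the $fit$centers layout shown earlier:

# Assign each new row to the closest training centre (Euclidean distance)
centers_mat <- as.matrix(model_train$fit$centers[, -1])  # drop the cluster id column
nearest <- apply(as.matrix(new_data), 1, function(x) {
  which.min(colSums((t(centers_mat) - x)^2))
})
head(nearest)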

Transforming New Data with PCA

# Train PCA model
pca_train <- tl_model(iris[1:100, 1:4], method = "pca")

# Transform new data
new_pca <- predict(pca_train, new_data = new_data)
head(new_pca)
#> # A tibble: 6 × 5
#>   .obs_id   PC1    PC2     PC3    PC4
#>   <chr>   <dbl>  <dbl>   <dbl>  <dbl>
#> 1 1        3.38 -1.28  -1.59    0.332
#> 2 2        2.53  0.265 -0.849   0.167
#> 3 3        3.78 -1.41  -0.0897  0.150
#> 4 4        2.85 -0.514 -0.495  -0.151
#> 5 5        3.39 -0.899 -0.797   0.201
#> 6 6        4.42 -1.85   0.295  -0.128
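
Projecting new data just applies the training centring (and scaling, if used) and multiplies by the rotation matrix. Assuming the raw prcomp object is stored at pca_train$fit$model, a manual equivalent is:

# Manual projection of new data onto the training principal components
pr <- pca_train$fit$model  # assumed location of the raw prcomp object
head(scale(as.matrix(new_data), center = pr$center, scale = pr$scale) %*% pr$rotation)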

Combining Multiple Techniques

PCA followed by Clustering

# Reduce dimensions with PCA
pca_model <- tl_model(iris[, 1:4], method = "pca")
pca_data <- predict(pca_model)

# Select first 2 components
pca_reduced <- pca_data %>% select(PC1, PC2)

# Cluster in reduced space
kmeans_pca <- tl_model(pca_reduced, method = "kmeans", k = 3)
clusters_pca <- kmeans_pca$fit$clusters

# Visualize
viz_combined <- pca_data %>%
  mutate(
    Cluster = as.factor(clusters_pca$cluster),
    Species = iris$Species
  )

ggplot(viz_combined, aes(x = PC1, y = PC2, color = Cluster, shape = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Clustering in PCA Space") +
  theme_minimal()

Practical Applications

Customer Segmentation

# Simulate customer data
set.seed(42)
customers <- data.frame(
  age = rnorm(200, 40, 15),
  income = rnorm(200, 50000, 20000),
  spending_score = rnorm(200, 50, 25)
)

# Standardize features
customers_scaled <- scale(customers) %>% as.data.frame()

# Cluster customers
customer_segments <- tl_model(customers_scaled, method = "kmeans", k = 4)
customers$segment <- customer_segments$fit$clusters$cluster

# Visualize segments
ggplot(customers, aes(x = income, y = spending_score, color = as.factor(segment))) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "Customer Segmentation",
    color = "Segment"
  ) +
  theme_minimal()
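
The segment labels become interpretable once each segment is profiled on the original (unscaled) variables:

# Profile segments on the original scale
customers %>%
  group_by(segment) %>%
  summarise(
    n = n(),
    mean_age = mean(age),
    mean_income = mean(income),
    mean_spending = mean(spending_score)
  )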

Feature Extraction

# Use PCA for feature extraction
pca_features <- tl_model(mtcars, method = "pca")

# Keep components explaining 90% of variance
var_exp <- pca_features$fit$variance_explained
cumulative_var <- cumsum(var_exp$prop_variance)
n_components <- which(cumulative_var >= 0.90)[1]

cat("Components needed for 90% variance:", n_components, "\n")
#> Components needed for 90% variance: 4
cat("Original features:", ncol(mtcars), "\n")
#> Original features: 11
cat("Dimension reduction:", round((1 - n_components/ncol(mtcars)) * 100, 1), "%\n")
#> Dimension reduction: 63.6 %
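
To use the reduced representation downstream, keep only the first n_components score columns (assuming the predict() output uses the same PC column naming as above):

# Keep only the components that reach 90% cumulative variance
reduced_features <- predict(pca_features) %>%
  select(all_of(paste0("PC", seq_len(n_components))))
head(reduced_features)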

Best Practices

  1. Scale your data before clustering or PCA for fair feature comparison
  2. Determine optimal k using elbow method or silhouette analysis
  3. Try multiple methods - different algorithms work better for different data
  4. Visualize results to understand cluster structure
  5. Consider domain knowledge when interpreting clusters
  6. Use PCA for visualization when data has more than 2-3 dimensions

Summary

tidylearn provides comprehensive unsupervised learning tools:

# Complete unsupervised workflow
workflow_data <- iris[, 1:4]

# 1. Reduce dimensions
pca_final <- tl_model(workflow_data, method = "pca")

# 2. Cluster in reduced space
pca_coords <- predict(pca_final) %>% select(PC1, PC2)
clusters_final <- tl_model(pca_coords, method = "kmeans", k = 3)

# 3. Visualize
final_viz <- pca_coords %>%
  mutate(
    Cluster = as.factor(clusters_final$fit$clusters$cluster),
    Species = iris$Species
  )

ggplot(final_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Complete Unsupervised Workflow") +
  theme_minimal()
