Unsupervised Learning with tidylearn

library(tidylearn)
library(dplyr)
library(ggplot2)

Introduction

This vignette explores unsupervised learning in tidylearn. All methods wrap established R packages; the algorithms themselves are unchanged, and tidylearn simply provides a consistent interface and tidy output.

The wrapped methods cover dimensionality reduction (PCA, MDS) and clustering (k-means, PAM, CLARA, hierarchical clustering, DBSCAN).

Access the underlying model object via model$fit$model (as in the hierarchical clustering and elbow-method examples below) when you need package-specific functionality.

Dimensionality Reduction

Dimensionality reduction techniques help visualize high-dimensional data and extract key patterns.

Principal Component Analysis (PCA)

# Perform PCA on iris data (excluding species)
model_pca <- tl_model(iris[, 1:4], method = "pca")
print(model_pca)
#> tidylearn Model
#> ===============
#> Paradigm: unsupervised 
#> Method: pca 
#> Technique: pca 
#> 
#> Training observations: 150
# Extract variance explained
variance_explained <- model_pca$fit$variance_explained
print(variance_explained)
#> # A tibble: 4 × 5
#>   component  sdev variance prop_variance cum_variance
#>   <chr>     <dbl>    <dbl>         <dbl>        <dbl>
#> 1 PC1       1.71    2.92         0.730          0.730
#> 2 PC2       0.956   0.914        0.229          0.958
#> 3 PC3       0.383   0.147        0.0367         0.995
#> 4 PC4       0.144   0.0207       0.00518        1
# Cumulative variance explained
cumsum(variance_explained$prop_variance)
#> [1] 0.7296245 0.9581321 0.9948213 1.0000000
# Transform data to principal components
pca_scores <- predict(model_pca)
head(pca_scores)
#> # A tibble: 6 × 5
#>   .obs_id   PC1    PC2     PC3      PC4
#>   <chr>   <dbl>  <dbl>   <dbl>    <dbl>
#> 1 1       -2.26 -0.478  0.127   0.0241 
#> 2 2       -2.07  0.672  0.234   0.103  
#> 3 3       -2.36  0.341 -0.0441  0.0283 
#> 4 4       -2.29  0.595 -0.0910 -0.0657 
#> 5 5       -2.38 -0.645 -0.0157 -0.0358 
#> 6 6       -2.07 -1.48  -0.0269  0.00659
# Visualize first two components
pca_plot_data <- pca_scores %>%
  mutate(Species = iris$Species)

ggplot(pca_plot_data, aes(x = PC1, y = PC2, color = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "PCA of Iris Dataset",
    x = paste0("PC1 (", round(variance_explained$prop_variance[1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(variance_explained$prop_variance[2] * 100, 1), "%)")
  ) +
  theme_minimal()

# Examine loadings (variable contributions)
loadings <- model_pca$fit$loadings
print(loadings)
#> # A tibble: 4 × 5
#>   variable        PC1     PC2    PC3    PC4
#>   <chr>         <dbl>   <dbl>  <dbl>  <dbl>
#> 1 Sepal.Length  0.521 -0.377   0.720  0.261
#> 2 Sepal.Width  -0.269 -0.923  -0.244 -0.124
#> 3 Petal.Length  0.580 -0.0245 -0.142 -0.801
#> 4 Petal.Width   0.565 -0.0669 -0.634  0.524
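
As a quick visual check, the loadings can be drawn as vectors in the PC1/PC2 plane. This is a minimal ggplot2 sketch built from the loadings tibble above (it is not a tidylearn helper):

# Plot each variable's loading as a vector in the PC1/PC2 plane
ggplot(loadings) +
  geom_segment(aes(x = 0, y = 0, xend = PC1, yend = PC2)) +
  geom_text(aes(x = PC1, y = PC2, label = variable), vjust = -0.5, size = 3) +
  labs(title = "PCA Loadings: PC1 vs PC2", x = "PC1", y = "PC2") +
  theme_minimal()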

Multidimensional Scaling (MDS)

# Perform MDS
model_mds <- tl_model(iris[, 1:4], method = "mds", k = 2)
print(model_mds)
#> tidylearn Model
#> ===============
#> Paradigm: unsupervised 
#> Method: mds 
#> Technique: mds 
#> 
#> Training observations: 150
# Extract MDS coordinates
mds_points <- predict(model_mds)
head(mds_points)
#> # A tibble: 6 × 2
#>    Dim1   Dim2
#>   <dbl>  <dbl>
#> 1 -2.68  0.319
#> 2 -2.71 -0.177
#> 3 -2.89 -0.145
#> 4 -2.75 -0.318
#> 5 -2.73  0.327
#> 6 -2.28  0.741
# Visualize MDS
mds_plot_data <- mds_points %>%
  mutate(Species = iris$Species)

ggplot(mds_plot_data, aes(x = Dim1, y = Dim2, color = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "MDS of Iris Dataset") +
  theme_minimal()
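
With Euclidean distances, classical MDS recovers the same configuration as PCA on the centred data. Assuming tidylearn uses classical scaling (stats::cmdscale) here, the coordinates can be reproduced in base R, up to sign flips of the axes:

# Classical MDS on Euclidean distances with base R
d <- dist(iris[, 1:4])
head(cmdscale(d, k = 2))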

Clustering

Clustering algorithms group similar observations together without using labels.

K-means Clustering

# Perform k-means with k=3
model_kmeans <- tl_model(iris[, 1:4], method = "kmeans", k = 3)
print(model_kmeans)
#> tidylearn Model
#> ===============
#> Paradigm: unsupervised 
#> Method: kmeans 
#> Technique: kmeans 
#> 
#> Training observations: 150
# Extract cluster assignments
clusters <- model_kmeans$fit$clusters
head(clusters)
#> # A tibble: 6 × 2
#>   .obs_id cluster
#>   <chr>     <int>
#> 1 1             3
#> 2 2             3
#> 3 3             3
#> 4 4             3
#> 5 5             3
#> 6 6             3
# Compare clusters with actual species
table(Cluster = clusters$cluster, Species = iris$Species)
#>        Species
#> Cluster setosa versicolor virginica
#>       1      0          2        36
#>       2      0         48        14
#>       3     50          0         0
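
The cross-tabulation can be condensed into a single agreement number such as cluster purity, the share of observations that fall into their cluster's majority species. A base-R sketch:

# Cluster purity: proportion of points matching their cluster's majority species
conf_mat <- table(clusters$cluster, iris$Species)
sum(apply(conf_mat, 1, max)) / nrow(iris)
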
# Visualize clusters using PCA
cluster_viz <- pca_scores %>%
  mutate(
    Cluster = as.factor(clusters$cluster),
    Species = iris$Species
  )

ggplot(cluster_viz, aes(x = PC1, y = PC2, color = Cluster, shape = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "K-means Clusters vs True Species") +
  theme_minimal()

# Access cluster centers
centers <- model_kmeans$fit$centers
print(centers)
#> # A tibble: 3 × 5
#>   cluster Sepal.Length Sepal.Width Petal.Length Petal.Width
#>     <int>        <dbl>       <dbl>        <dbl>       <dbl>
#> 1       1         6.85        3.07         5.74       2.07 
#> 2       2         5.90        2.75         4.39       1.43 
#> 3       3         5.01        3.43         1.46       0.246

PAM (K-medoids)

PAM (Partitioning Around Medoids) is more robust to outliers than k-means because it uses actual observations (medoids) rather than means as cluster centres:

# Perform PAM clustering
model_pam <- tl_model(iris[, 1:4], method = "pam", k = 3)
print(model_pam)

# Extract clusters
clusters_pam <- model_pam$fit$clusters
table(Cluster = clusters_pam$cluster, Species = iris$Species)
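
PAM also reports an average silhouette width, which is useful when comparing clusterings. Assuming the raw cluster::pam object is stored at model_pam$fit$model (the same slot used for the raw fits in the examples below), it can be read directly:

# Average silhouette width from the underlying pam object (assumed slot)
model_pam$fit$model$silinfo$avg.width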

Hierarchical Clustering

# Perform hierarchical clustering
model_hclust <- tl_model(iris[, 1:4], method = "hclust")
print(model_hclust)
#> tidylearn Model
#> ===============
#> Paradigm: unsupervised 
#> Method: hclust 
#> Technique: hclust 
#> 
#> Training observations: 150
# Plot dendrogram
plot(model_hclust$fit$model, labels = FALSE, main = "Hierarchical Clustering of Iris")

# Cut tree to get clusters
k <- 3
clusters_hc <- cutree(model_hclust$fit$model, k = k)
table(Cluster = clusters_hc, Species = iris$Species)
#>        Species
#> Cluster setosa versicolor virginica
#>       1     50          0         0
#>       2      0         50        14
#>       3      0          0        36
# Visualize hierarchical clusters
hc_viz <- pca_scores %>%
  mutate(
    Cluster = as.factor(clusters_hc),
    Species = iris$Species
  )

ggplot(hc_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Hierarchical Clustering Results") +
  theme_minimal()

DBSCAN (Density-Based Clustering)

DBSCAN groups points that lie in dense regions, so it can find arbitrarily shaped clusters and flags low-density points as noise (cluster 0):

# Perform DBSCAN
model_dbscan <- tl_model(iris[, 1:4], method = "dbscan", eps = 0.5, minPts = 5)
print(model_dbscan)

# Extract clusters (0 = noise/outliers)
clusters_dbscan <- model_dbscan$fit$clusters
table(clusters_dbscan$cluster)

# Compare with species
table(Cluster = clusters_dbscan$cluster, Species = iris$Species)
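
The eps parameter is the most sensitive setting. A common heuristic is the k-nearest-neighbour distance plot: sort each point's distance to its k-th neighbour and place eps near the "knee" of the curve. Assuming the dbscan package (which tidylearn presumably wraps here) is installed:

# k-nearest-neighbour distance plot for choosing eps
dbscan::kNNdistplot(iris[, 1:4], k = 5)
abline(h = 0.5, lty = 2)  # the eps value used above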

CLARA (for Large Datasets)

CLARA (Clustering Large Applications) scales PAM to large datasets by running it on repeated random subsamples:

# Create larger dataset
large_data <- iris[rep(1:nrow(iris), 10), 1:4]

# Perform CLARA
model_clara <- tl_model(large_data, method = "clara", k = 3, samples = 5)
print(model_clara)

# Extract clusters
clusters_clara <- model_clara$fit$clusters

Choosing the Number of Clusters

Elbow Method

# Try different values of k
k_values <- 2:8
within_ss <- numeric(length(k_values))

for (i in seq_along(k_values)) {
  k <- k_values[i]
  model <- tl_model(iris[, 1:4], method = "kmeans", k = k)
  within_ss[i] <- model$fit$model$tot.withinss
}

# Plot elbow curve
elbow_data <- data.frame(k = k_values, within_ss = within_ss)

ggplot(elbow_data, aes(x = k, y = within_ss)) +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  labs(
    title = "Elbow Method for Optimal k",
    x = "Number of Clusters (k)",
    y = "Total Within-Cluster Sum of Squares"
  ) +
  theme_minimal()
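
Silhouette analysis complements the elbow plot: the average silhouette width measures how well separated the clusters are, and the k that maximises it is a reasonable default. A sketch using cluster::silhouette() (the cluster package ships with R as a recommended package):

library(cluster)

# Average silhouette width for each candidate k
d <- dist(iris[, 1:4])
avg_sil <- sapply(k_values, function(k) {
  model <- tl_model(iris[, 1:4], method = "kmeans", k = k)
  sil <- silhouette(model$fit$clusters$cluster, d)
  mean(sil[, "sil_width"])
})

# k with the highest average silhouette width
k_values[which.max(avg_sil)]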

Predicting on New Data

Clustering New Observations

# Train clustering model
model_train <- tl_model(iris[1:100, 1:4], method = "kmeans", k = 3)

# Predict cluster assignments for new data
new_data <- iris[101:150, 1:4]
new_clusters <- predict(model_train, new_data = new_data)

head(new_clusters)
#> # A tibble: 6 × 1
#>   cluster
#>     <int>
#> 1       3
#> 2       3
#> 3       3
#> 4       3
#> 5       3
#> 6       3
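
For k-means, predict() presumably assigns each new observation to the nearest training centre in Euclidean distance. A manual sketch of that logic, using the $fit$centers layout shown earlier:

# Assign each new row to the closest training centre (Euclidean distance)
centers_mat <- as.matrix(model_train$fit$centers[, -1])  # drop the cluster id column
nearest <- apply(as.matrix(new_data), 1, function(x) {
  which.min(colSums((t(centers_mat) - x)^2))
})
head(nearest)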

Transforming New Data with PCA

# Train PCA model
pca_train <- tl_model(iris[1:100, 1:4], method = "pca")

# Transform new data
new_pca <- predict(pca_train, new_data = new_data)
head(new_pca)
#> # A tibble: 6 × 5
#>   .obs_id   PC1    PC2     PC3    PC4
#>   <chr>   <dbl>  <dbl>   <dbl>  <dbl>
#> 1 1        3.38 -1.28  -1.59    0.332
#> 2 2        2.53  0.265 -0.849   0.167
#> 3 3        3.78 -1.41  -0.0897  0.150
#> 4 4        2.85 -0.514 -0.495  -0.151
#> 5 5        3.39 -0.899 -0.797   0.201
#> 6 6        4.42 -1.85   0.295  -0.128
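
Projecting new data just applies the training centring (and scaling, if used) and multiplies by the rotation matrix. Assuming the raw prcomp object is stored at pca_train$fit$model, a manual equivalent is:

# Manual projection of new data onto the training principal components
pr <- pca_train$fit$model  # assumed location of the raw prcomp object
head(scale(as.matrix(new_data), center = pr$center, scale = pr$scale) %*% pr$rotation)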

Combining Multiple Techniques

PCA followed by Clustering

# Reduce dimensions with PCA
pca_model <- tl_model(iris[, 1:4], method = "pca")
pca_data <- predict(pca_model)

# Select first 2 components
pca_reduced <- pca_data %>% select(PC1, PC2)

# Cluster in reduced space
kmeans_pca <- tl_model(pca_reduced, method = "kmeans", k = 3)
clusters_pca <- kmeans_pca$fit$clusters

# Visualize
viz_combined <- pca_data %>%
  mutate(
    Cluster = as.factor(clusters_pca$cluster),
    Species = iris$Species
  )

ggplot(viz_combined, aes(x = PC1, y = PC2, color = Cluster, shape = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Clustering in PCA Space") +
  theme_minimal()

Practical Applications

Customer Segmentation

# Simulate customer data
set.seed(42)
customers <- data.frame(
  age = rnorm(200, 40, 15),
  income = rnorm(200, 50000, 20000),
  spending_score = rnorm(200, 50, 25)
)

# Standardize features
customers_scaled <- scale(customers) %>% as.data.frame()

# Cluster customers
customer_segments <- tl_model(customers_scaled, method = "kmeans", k = 4)
customers$segment <- customer_segments$fit$clusters$cluster

# Visualize segments
ggplot(customers, aes(x = income, y = spending_score, color = as.factor(segment))) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "Customer Segmentation",
    color = "Segment"
  ) +
  theme_minimal()
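
The segment labels become interpretable once each segment is profiled on the original (unscaled) variables:

# Profile segments on the original scale
customers %>%
  group_by(segment) %>%
  summarise(
    n = n(),
    mean_age = mean(age),
    mean_income = mean(income),
    mean_spending = mean(spending_score)
  )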

Feature Extraction

# Use PCA for feature extraction
pca_features <- tl_model(mtcars, method = "pca")

# Keep components explaining 90% of variance
var_exp <- pca_features$fit$variance_explained
cumulative_var <- cumsum(var_exp$prop_variance)
n_components <- which(cumulative_var >= 0.90)[1]

cat("Components needed for 90% variance:", n_components, "\n")
#> Components needed for 90% variance: 4
cat("Original features:", ncol(mtcars), "\n")
#> Original features: 11
cat("Dimension reduction:", round((1 - n_components/ncol(mtcars)) * 100, 1), "%\n")
#> Dimension reduction: 63.6 %
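
To use the reduced representation downstream, keep only the first n_components score columns (assuming the predict() output uses the same PC column naming as above):

# Keep only the components that reach 90% cumulative variance
reduced_features <- predict(pca_features) %>%
  select(all_of(paste0("PC", seq_len(n_components))))
head(reduced_features)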

Best Practices

  1. Scale your data before clustering or PCA for fair feature comparison
  2. Determine optimal k using elbow method or silhouette analysis
  3. Try multiple methods - different algorithms work better for different data
  4. Visualize results to understand cluster structure
  5. Consider domain knowledge when interpreting clusters
  6. Use PCA for visualization when data has more than 2-3 dimensions

Summary

tidylearn provides comprehensive unsupervised learning tools:

# Complete unsupervised workflow
workflow_data <- iris[, 1:4]

# 1. Reduce dimensions
pca_final <- tl_model(workflow_data, method = "pca")

# 2. Cluster in reduced space
pca_coords <- predict(pca_final) %>% select(PC1, PC2)
clusters_final <- tl_model(pca_coords, method = "kmeans", k = 3)

# 3. Visualize
final_viz <- pca_coords %>%
  mutate(
    Cluster = as.factor(clusters_final$fit$clusters$cluster),
    Species = iris$Species
  )

ggplot(final_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Complete Unsupervised Workflow") +
  theme_minimal()
