Data Cleaning and Preparation

library(clinpubr)
library(dplyr)

Introduction

Clinical data from Electronic Health Records (EHR) often contains inconsistencies, missing values, outliers, and formatting issues that must be addressed before analysis. This vignette demonstrates a structured cleaning workflow using clinpubr:

  1. Data Overview — Assess data quality
  2. Format Cleaning — Extract numbers, convert dates, check non-numeric values
  3. Unit Standardization — Harmonize measurement units
  4. Outlier Detection — Identify and manage outliers (after standardization)
  5. Missing Value Handling — Filter or handle missing data
  6. Data Transformation — Create derived variables

Why this order? Format cleaning and unit standardization must precede outlier detection — mixed units (e.g., mg/dL vs mmol/L) can cause false outlier flags.

Creating Realistic Messy Lab Data

Let’s create a sample laboratory dataset mimicking real-world EHR data with common quality issues. This dataset will be used throughout the vignette.

set.seed(123)
n <- 100

# Patient IDs
patient_ids <- paste0("P", sprintf("%03d", 1:n))

# Glucose: mix of mg/dL and mmol/L (1 mmol/L = 18 mg/dL)
# Also include some outliers and missing values
glucose_vals <- c(
  rnorm(35, mean = 100, sd = 15), # mg/dL normal range
  rnorm(30, mean = 5.5, sd = 0.8), # mmol/L normal range
  rnorm(10, mean = 200, sd = 20), # mg/dL high (outliers)
  rnorm(10, mean = 15, sd = 2), # mmol/L high (outliers)
  rep(NA, 10), # missing values
  999, 888, 0, -50, 500 # erroneous values
)
glucose_vals <- round(sample(glucose_vals, n), 1)

glucose_units <- sample(c("mg/dL", "mmol/L", NA), n, replace = TRUE, prob = c(0.5, 0.4, 0.1))

# Creatinine: mix of mg/dL and umol/L (1 mg/dL = 88.4 umol/L)
creatinine_vals <- c(
  rnorm(35, mean = 1.0, sd = 0.2), # mg/dL normal range
  rnorm(30, mean = 88, sd = 15), # umol/L normal range
  rnorm(10, mean = 3.0, sd = 0.5), # mg/dL high (outliers)
  rnorm(10, mean = 265, sd = 44), # umol/L high (outliers)
  rep(NA, 10), # missing values
  999, 0, -10, 1000, 0.001 # erroneous values
)
creatinine_vals <- round(sample(creatinine_vals, n), 2)

creatinine_units <- sample(c("mg/dL", "umol/L", NA), n, replace = TRUE, prob = c(0.5, 0.4, 0.1))

# Cholesterol: mix of mg/dL and mmol/L (1 mmol/L = 38.67 mg/dL)
cholesterol_vals <- c(
  rnorm(35, mean = 180, sd = 30), # mg/dL normal range
  rnorm(30, mean = 4.5, sd = 0.8), # mmol/L normal range
  rnorm(10, mean = 350, sd = 40), # mg/dL high (outliers)
  rnorm(10, mean = 9, sd = 1), # mmol/L high (outliers)
  rep(NA, 10), # missing values
  888, 999, 50, 600, 0 # erroneous values
)
cholesterol_vals <- round(sample(cholesterol_vals, n), 2)

cholesterol_units <- sample(c("mg/dL", "mmol/L", NA), n, replace = TRUE, prob = c(0.5, 0.4, 0.1))

# Test dates with various formats and impossible dates
test_date_vals <- sample(c(
  format(Sys.Date() - sample(1:365, 60, replace = TRUE), "%Y-%m-%d"), # ISO format
  format(Sys.Date() - sample(1:365, 30, replace = TRUE), "%Y/%m/%d"), # Slash format
  "1900-01-01", "2030-12-31", "N/A", "", "unknown", "pending", # Invalid dates
  rep(NA, 10)
), n * 3, replace = TRUE)

# Create long-format lab data with messy values
lab_data <- data.frame(
  patient_id = rep(patient_ids, 3),
  test = rep(c("Glucose", "Creatinine", "Cholesterol"), each = n),
  value = c(glucose_vals, creatinine_vals, cholesterol_vals),
  unit = c(glucose_units, creatinine_units, cholesterol_units),
  test_date = test_date_vals
)

# Add messy string values that need cleaning
# Glucose with text annotations and European decimal commas
messy_glucose_idx <- sample(which(lab_data$test == "Glucose"), 15)
lab_data$value[messy_glucose_idx] <- sample(c(
  "<40", ">500", "5.2", "6.8", "120 mg/dL", "89 mmol/L",
  "5.2 (fasting)", "180 (post-meal)", "N/A", "pending",
  "6..5", "7..2", "8.5.1", "normal range", "see comment"
), length(messy_glucose_idx), replace = TRUE)

# Creatinine with text annotations
messy_creatinine_idx <- sample(which(lab_data$test == "Creatinine"), 12)
lab_data$value[messy_creatinine_idx] <- sample(c(
  "<0.5", ">5.0", "1.1", "0.9", "1.2 mg/dL", "110 umol/L",
  "1.5 (dialysis)", "2.8 (critical)", "N/A", " hemolyzed",
  "1.3 ", " 1.4", "(1.2)", "1.0*", "see note"
), length(messy_creatinine_idx), replace = TRUE)

# Cholesterol with text annotations
messy_cholesterol_idx <- sample(which(lab_data$test == "Cholesterol"), 10)
lab_data$value[messy_cholesterol_idx] <- sample(c(
  "<100", ">400", "4.5", "5.2", "200 mg/dL", "5.5 mmol/L",
  "180 (fasting)", "250 (borderline)", "N/A", "lipemic"
), length(messy_cholesterol_idx), replace = TRUE)

knitr::kable(head(lab_data[sample(nrow(lab_data)), ], 10),
  caption = "Original Messy Lab Data (10 random rows)"
)
Original Messy Lab Data (10 random rows)
patient_id test value unit test_date
267 P067 Cholesterol 4.56 mg/dL 2026-04-29
81 P081 Glucose 5.4 mg/dL 2025-10-25
260 P060 Cholesterol 227.31 mmol/L 2026/03/08
76 P076 Glucose 218.4 mg/dL 2025-10-29
184 P084 Creatinine 263.9 mg/dL 2026-02-13
14 P014 Glucose 190.2 mg/dL 2025/12/01
51 P051 Glucose NA mg/dL 2025/07/15
155 P055 Creatinine 1.09 umol/L 2026-04-20
75 P075 Glucose 5.7 NA 2025-11-14
236 P036 Cholesterol 8.39 mg/dL 2025/12/30

Step 1: Data Overview

Data used: lab_data (the messy lab data created above)

Use data_overview() to get a comprehensive diagnostic report of data quality issues:

overview <- data_overview(lab_data)
#> === Data Overview Summary ===
#> Dataset: 300 rows, 5 columns
#> 
#> Variable Types:
#>   character : 5 variables
#> 
#> Found 4 potential quality issues:
#>   numeric_as_character     : 1 cases
#>   missing_values           : 3 cases
#> 
#> Recommendations:
#>   - Consider converting these character variables to numeric: value
#>   - Variables with < 50 % missing values: unit, value, test_date - consider imputation

print(overview$variable_types)
#> $character
#> [1] "patient_id" "test"       "value"      "unit"       "test_date"
print(overview$summary_stats)
#> $character
#>              variable   n missing missing_pct unique   top_value top_freq
#> patient_id patient_id 300       0        0.00    100        P001        3
#> test             test 300       0        0.00      3 Cholesterol      100
#> value           value 275      25        8.33    236           0        4
#> unit             unit 268      32       10.67      3       mg/dL      159
#> test_date   test_date 279      21        7.00     86  2025-08-31        9
#>            top_pct
#> patient_id    1.00
#> test         33.33
#> value         1.45
#> unit         59.33
#> test_date     3.23
print(overview$quality_issues$missing_values)
#>      unit     value test_date 
#>     10.67      8.33      7.00

Check for unit conflicts across different test types:

knitr::kable(unit_view(lab_data, subject_col = "test", value_col = "value", unit_col = "unit"),
  caption = "Unit Conflicts by Test Type"
)
Unit Conflicts by Test Type
subject unit label count nvalid mean sd median q_0.025 q_0.975
Cholesterol mg/dL NA 50 41 119.34902 175.46788 9.70 3.70000 342.5300
Cholesterol mmol/L NA 40 33 185.58394 197.11032 183.62 3.73600 657.6000
Cholesterol NA NA 10 8 102.32375 84.31264 141.32 0.56000 189.1785
Creatinine mg/dL NA 54 46 94.63500 212.69011 2.87 0.70000 915.3400
Creatinine umol/L NA 37 32 57.27156 82.24823 2.72 -2.25000 292.5532
Creatinine NA NA 9 8 74.75500 109.80029 41.49 0.14175 283.1100
Glucose mg/dL NA 55 41 88.75610 159.26087 16.30 0.00000 500.0000
Glucose mmol/L NA 32 26 71.28846 63.38110 84.30 4.56250 202.9750
Glucose NA NA 13 9 178.11111 314.49209 82.90 5.74000 841.0000

Step 2: Format Cleaning

Data used: lab_data

Format cleaning is essential before any numerical analysis. Lab data often contains mixed formats, text annotations, and inconsistent date representations.

2.1 Checking for Non-numeric Values

First, identify which entries cannot be converted to numeric values:

nonnum_df <- df_view_nonnum(lab_data)
knitr::kable(head(nonnum_df, 15), caption = "Non-numeric Entries by Variable")
Non-numeric Entries by Variable
patient_id test test_date unit value
P001 Glucose 2026/05/21 mg/dL N/A
P002 Creatinine pending mmol/L 7..2
P003 Cholesterol 2025/07/05 umol/L 180 (post-meal)
P004 NA 2025-12-06 NA >500
P005 NA 1900-01-01 NA <40
P006 NA 2026-04-19 NA 8.5.1
P007 NA 2026/04/10 NA 6..5
P008 NA 2025/12/30 NA see comment
P009 NA 2026-04-29 NA normal range
P010 NA 2026-04-20 NA 1.2 mg/dL
P011 NA 2025-08-31 NA 110 umol/L
P012 NA 2025/12/01 NA >5.0
P013 NA 2026/01/25 NA (1.2)
P014 NA 2025-12-23 NA >400
P015 NA 2025/10/19 NA 250 (borderline)

2.2 Extracting Numeric Values

Use extract_num() to clean messy numeric strings. This function handles: - Text annotations (e.g., “120 mg/dL” → 120) - Range indicators (e.g., “<40”, “>500”) - Extra decimal points (e.g., “5..2” → 5.2) - Extra whitespace and punctuation

# Create a copy for cleaning
lab_data_cleaned <- lab_data

# Extract numeric values from the messy value column
lab_data_cleaned$value_numeric <- extract_num(lab_data$value)

# Show before/after comparison
comparison <- data.frame(
  test = lab_data$test,
  original = lab_data$value,
  cleaned = lab_data_cleaned$value_numeric,
  unit = lab_data$unit
) %>%
  dplyr::filter(original != cleaned | is.na(cleaned)) %>%
  head(15)

knitr::kable(comparison, caption = "Value Cleaning: Before vs After")
Value Cleaning: Before vs After
test original cleaned unit
Glucose N/A NA mg/dL
Glucose 7..2 7.2 mg/dL
Glucose NA NA mmol/L
Glucose NA NA mg/dL
Glucose NA NA NA
Glucose N/A NA mg/dL
Glucose NA NA mg/dL
Glucose 180 (post-meal) 180.0 mmol/L
Glucose NA NA mmol/L
Glucose >500 500.0 mmol/L
Glucose NA NA mg/dL
Glucose <40 40.0 mg/dL
Glucose 8.5.1 8.5 mg/dL
Glucose 6..5 6.5 mg/dL
Glucose <40 40.0 NA

2.3 Date Conversion

Lab data often contains test dates in various formats. Use to_date() to standardize them:

# Convert various date formats to standard Date objects
lab_data_cleaned$test_date_clean <- to_date(lab_data$test_date)

# Show date conversion results
date_comparison <- data.frame(
  original = lab_data$test_date,
  cleaned = lab_data_cleaned$test_date_clean
) %>%
  dplyr::filter(!is.na(original)) %>%
  head(15)

knitr::kable(date_comparison, caption = "Date Conversion: Before vs After")
Date Conversion: Before vs After
original cleaned
2026/05/21 2026-05-21
pending NA
2025/07/05 2025-07-05
2025-12-06 2025-12-06
1900-01-01 1900-01-01
2026-04-19 2026-04-19
2026/04/10 2026-04-10
2025/12/30 2025-12-30
2026-04-29 2026-04-29
2025-12-06 2025-12-06
2026-04-20 2026-04-20
2025-08-31 2025-08-31
2025/12/01 2025-12-01
2025/12/01 2025-12-01
2026/01/25 2026-01-25

Check for invalid dates that couldn’t be parsed:

invalid_dates <- lab_data_cleaned %>%
  dplyr::filter(!is.na(test_date) & is.na(test_date_clean)) %>%
  dplyr::select(patient_id, test, test_date) %>%
  head(10)

knitr::kable(invalid_dates, caption = "Invalid Date Entries (Could Not Be Parsed)")
Invalid Date Entries (Could Not Be Parsed)
patient_id test test_date
P002 Glucose pending
P068 Glucose unknown
P079 Glucose unknown
P088 Glucose pending
P018 Creatinine unknown
P058 Creatinine N/A
P059 Creatinine N/A
P065 Creatinine
P073 Creatinine unknown
P080 Creatinine N/A

2.4 Update Data for Next Steps

Replace the original messy columns with cleaned versions:

lab_data <- lab_data_cleaned %>%
  dplyr::mutate(
    value = value_numeric,
    test_date = test_date_clean
  ) %>%
  dplyr::select(-value_numeric, -test_date_clean)

Step 3: Unit Standardization

Data used: lab_data

Unit standardization must be performed before outlier detection. This is critical when merging data from different laboratories. We will standardize all values to conventional units:

change_rules <- list(
  list(subject = "Glucose", target_unit = "mg/dL", units2change = "mmol/L", coeffs = 18),
  list(subject = "Creatinine", target_unit = "mg/dL", units2change = "umol/L", coeffs = 1 / 88.4),
  list(subject = "Cholesterol", target_unit = "mg/dL", units2change = "mmol/L", coeffs = 38.67)
)

lab_data_std <- unit_standardize(
  lab_data,
  subject_col = "test", value_col = "value",
  unit_col = "unit", change_rules = change_rules
)

knitr::kable(head(lab_data_std, 15), caption = "Lab Data After Unit Standardization")
Lab Data After Unit Standardization
patient_id test value unit test_date
P001 Glucose NA mg/dL 2026-05-21
P002 Glucose 2282.4 mg/dL NA
P003 Glucose 7.2 mg/dL 2025-07-05
P004 Glucose 95.6 mg/dL 2025-12-06
P005 Glucose 1512.0 mg/dL 1900-01-01
P006 Glucose 95.4 mg/dL 2026-04-19
P007 Glucose 186.2 mg/dL 2026-04-10
P008 Glucose 6.1 mg/dL 2025-12-30
P009 Glucose 257.4 mg/dL 2026-04-29
P010 Glucose NA mg/dL 2025-12-06
P011 Glucose 5.3 mg/dL 2026-04-20
P012 Glucose 6.6 mg/dL 2025-08-31
P013 Glucose 1614.6 mg/dL 2025-12-01
P014 Glucose 190.2 mg/dL 2025-12-01
P015 Glucose 5.1 mg/dL 2026-01-25

Verify that units are now standardized:

knitr::kable(unit_view(lab_data_std, subject_col = "test", value_col = "value", unit_col = "unit"),
  caption = "Units After Standardization"
)
Units After Standardization
subject unit label count nvalid mean sd median q_0.025 q_0.975

Step 4: Outlier Detection

Data used: lab_data_std (unit-standardized lab data)

Important: Outlier detection is performed after unit standardization to ensure consistent scales across all measurements.

The package provides three methods, each with different strengths:

Detect outliers for each test type using the IQR method:

# Split data by test type for outlier detection
lab_data_clean <- lab_data_std

# Detect outliers for each test
for (test_name in c("Glucose", "Creatinine", "Cholesterol")) {
  test_data <- lab_data_std$value[lab_data_std$test == test_name]

  outlier_res <- detect_outliers(test_data, method = "iqr")

  cat("\n", test_name, ":\n")
  cat("  Total values:", length(test_data), "\n")
  cat("  Outliers detected:", sum(outlier_res$outlier_mask, na.rm = TRUE), "\n")
  cat("  Missing values:", sum(is.na(test_data)), "\n")
}
#> 
#>  Glucose :
#>   Total values: 100 
#>   Outliers detected: 19 
#>   Missing values: 14 
#> 
#>  Creatinine :
#>   Total values: 100 
#>   Outliers detected: 23 
#>   Missing values: 8 
#> 
#>  Cholesterol :
#>   Total values: 100 
#>   Outliers detected: 22 
#>   Missing values: 10

Compare outlier detection methods for Glucose:

glucose_values <- lab_data_std$value[lab_data_std$test == "Glucose"]

knitr::kable(data.frame(
  Method = c("MAD", "IQR", "Z-score"),
  Outlier_Count = c(
    sum(mad_outlier(glucose_values, threshold = 3), na.rm = TRUE),
    sum(iqr_outlier(glucose_values, threshold = 1.5), na.rm = TRUE),
    sum(zscore_outlier(glucose_values, threshold = 3), na.rm = TRUE)
  )
), caption = "Outlier Detection Results by Method (Glucose)")
Outlier Detection Results by Method (Glucose)
Method Outlier_Count
MAD 19
IQR 19
Z-score 1

Handling Outliers

For this demonstration, we will set outliers to NA. In practice, outliers should be manually verified — they may represent true extreme values (e.g., severe hyperglycemia) rather than data errors.

# Create a copy for outlier handling
lab_data_final <- lab_data_std

# Set outliers to NA for each test type
for (test_name in c("Glucose", "Creatinine", "Cholesterol")) {
  test_idx <- lab_data_std$test == test_name
  test_values <- lab_data_std$value[test_idx]

  outlier_res <- detect_outliers(test_values, method = "iqr")

  # Set outliers to NA
  lab_data_final$value[test_idx][outlier_res$outlier_mask] <- NA
}

# Compare before and after outlier handling
comparison_df <- data.frame(
  Test = rep(c("Glucose", "Creatinine", "Cholesterol"), each = 2),
  Stage = rep(c("Before", "After"), 3),
  N = c(
    sum(!is.na(lab_data_std$value[lab_data_std$test == "Glucose"])),
    sum(!is.na(lab_data_final$value[lab_data_final$test == "Glucose"])),
    sum(!is.na(lab_data_std$value[lab_data_std$test == "Creatinine"])),
    sum(!is.na(lab_data_final$value[lab_data_final$test == "Creatinine"])),
    sum(!is.na(lab_data_std$value[lab_data_std$test == "Cholesterol"])),
    sum(!is.na(lab_data_final$value[lab_data_final$test == "Cholesterol"]))
  ),
  Mean = c(
    mean(lab_data_std$value[lab_data_std$test == "Glucose"], na.rm = TRUE),
    mean(lab_data_final$value[lab_data_final$test == "Glucose"], na.rm = TRUE),
    mean(lab_data_std$value[lab_data_std$test == "Creatinine"], na.rm = TRUE),
    mean(lab_data_final$value[lab_data_final$test == "Creatinine"], na.rm = TRUE),
    mean(lab_data_std$value[lab_data_std$test == "Cholesterol"], na.rm = TRUE),
    mean(lab_data_final$value[lab_data_final$test == "Cholesterol"], na.rm = TRUE)
  )
)

knitr::kable(comparison_df, caption = "Before vs After Outlier Handling", digits = 2)
Before vs After Outlier Handling
Test Stage N Mean
Glucose Before 86 599.81
Glucose After 67 82.01
Creatinine Before 92 55.35
Creatinine After 69 1.11
Cholesterol Before 90 2818.59
Cholesterol After 68 147.00

Clinical Note: Always verify outliers before removal. Some “outliers” may be clinically significant (e.g., glucose > 400 mg/dL in diabetic ketoacidosis). Consider creating an outlier indicator variable for sensitivity analyses rather than deleting values.

Step 5: Missing Value Handling

Data used: lab_data_final (after standardization and outlier handling)

For missing data filtering by row/column missing rate, use get_valid_subset() to obtain a complete-enough subset for analysis. First, let’s convert to wide format to create a longitudinal cohort dataset where each row represents a patient-test date combination:

# Convert to wide format preserving all test dates
# This creates a longitudinal cohort dataset
lab_wide <- lab_data_final %>%
  filter(!is.na(test_date)) %>% # Remove records without valid dates
  select(patient_id, test_date, test, value) %>%
  tidyr::pivot_wider(
    names_from = test,
    values_from = value,
    names_sort = TRUE
  )

# Sort by patient and date for longitudinal view
lab_wide <- lab_wide %>%
  arrange(patient_id, test_date)

knitr::kable(head(lab_wide, 10), caption = "Longitudinal Lab Data (First 10 records)")
Longitudinal Lab Data (First 10 records)
patient_id test_date Cholesterol Creatinine Glucose
P001 2025-11-01 NA 1.13 NA
P001 2025-12-11 3.70 NA NA
P001 2026-05-21 NA NA NA
P002 2025-10-21 9.70 NA NA
P002 2025-12-29 NA NA NA
P003 2025-07-05 NA NA 7.2
P003 2026-02-26 NA NA NA
P003 2026-04-29 NA NA NA
P004 2025-11-19 329.49 NA NA
P004 2025-12-06 NA NA 95.6

View the cohort structure - number of tests per patient:

cohort_summary <- lab_wide %>%
  group_by(patient_id) %>%
  summarise(
    n_visits = n(),
    first_date = min(test_date),
    last_date = max(test_date)
  ) %>%
  head(10)

knitr::kable(cohort_summary, caption = "Cohort Structure: Visits per Patient")
Cohort Structure: Visits per Patient
patient_id n_visits first_date last_date
P001 3 2025-11-01 2026-05-21
P002 2 2025-10-21 2025-12-29
P003 3 2025-07-05 2026-04-29
P004 3 2025-11-19 2026-04-10
P005 3 1900-01-01 2026-02-19
P006 2 2026-03-05 2026-04-19
P007 2 2026-04-10 2030-12-31
P008 3 2025-09-24 2026-02-10
P009 3 2025-12-17 2026-04-29
P010 3 2025-08-25 2025-12-06

Assess missing data patterns:

missing_summary <- data.frame(
  Variable = names(lab_wide)[-1],
  Missing_Count = sapply(lab_wide[-1], function(x) sum(is.na(x))),
  Missing_Percent = round(sapply(lab_wide[-1], function(x) sum(is.na(x)) / length(x) * 100, 2))
)
#> Error in FUN(X[[i]], ...): 参数没有用(2)

knitr::kable(missing_summary, caption = "Missing Data Summary by Test")
#> Error: 找不到对象'missing_summary'

Filter records with too much missing data:

lab_wide_clean <- get_valid_subset(
  lab_wide,
  row_na_ratio = 0.5, # Allow up to 50% missing per record
  col_na_ratio = 0.3 # Allow up to 30% missing per test type
)

cat(
  "Dimensions:", nrow(lab_wide), "records",
  "->", nrow(lab_wide_clean), "records\n"
)
#> Dimensions: 259 records -> 259 records
cat(
  "Unique patients:", length(unique(lab_wide$patient_id)),
  "->", length(unique(lab_wide_clean$patient_id)), "\n"
)
#> Unique patients: 99 -> 99

Step 6: Data Transformation

Data used: lab_wide_clean (cleaned longitudinal cohort data)

Creating Clinical Categories

Create categorical variables from continuous lab values using clinical cut-points. In longitudinal data, each visit can have different classifications:

# Glucose categories
lab_wide_clean$glucose_category <- cut_by(
  lab_wide_clean$Glucose,
  breaks = c(100, 126),
  labels = c("Normal", "Prediabetes", "Diabetes"),
  label_with_range = FALSE
)
#> Error in cut_by(lab_wide_clean$Glucose, breaks = c(100, 126), labels = c("Normal", : Some of `breaks` are not in the range of `x`!

# Creatinine categories (eGFR approximation)
lab_wide_clean$renal_function <- cut_by(
  lab_wide_clean$Creatinine,
  breaks = c(1.2, 2.0, 4.0),
  labels = c("Normal", "Mild", "Moderate", "Severe"),
  label_with_range = FALSE
)
#> Error in cut_by(lab_wide_clean$Creatinine, breaks = c(1.2, 2, 4), labels = c("Normal", : Some of `breaks` are not in the range of `x`!

# Cholesterol categories
lab_wide_clean$cholesterol_category <- cut_by(
  lab_wide_clean$Cholesterol,
  breaks = c(200, 240),
  labels = c("Desirable", "Borderline", "High"),
  label_with_range = FALSE
)
#> Error in cut_by(lab_wide_clean$Cholesterol, breaks = c(200, 240), labels = c("Desirable", : Some of `breaks` are not in the range of `x`!

# Summary of categories across all visits
cat_summary <- data.frame(
  Category = c(
    rep("Glucose", length(table(lab_wide_clean$glucose_category))),
    rep("Renal Function", length(table(lab_wide_clean$renal_function))),
    rep("Cholesterol", length(table(lab_wide_clean$cholesterol_category)))
  ),
  Level = c(
    names(table(lab_wide_clean$glucose_category)),
    names(table(lab_wide_clean$renal_function)),
    names(table(lab_wide_clean$cholesterol_category))
  ),
  Count = c(
    as.vector(table(lab_wide_clean$glucose_category)),
    as.vector(table(lab_wide_clean$renal_function)),
    as.vector(table(lab_wide_clean$cholesterol_category))
  )
)

knitr::kable(cat_summary, caption = "Clinical Category Distributions (All Visits)")
Clinical Category Distributions (All Visits)
Category Count

Longitudinal View Example

View a single patient’s trajectory over time:

# Select one patient with multiple visits for demonstration
patient_trajectory <- lab_wide_clean %>%
  filter(patient_id == unique(patient_id)[1]) %>%
  select(
    patient_id, test_date, Glucose, Creatinine, Cholesterol,
    glucose_category, renal_function
  ) %>%
  arrange(test_date)
#> Error in `select()`:
#> ! Can't select columns that don't exist.
#> ✖ Column `Glucose` doesn't exist.

knitr::kable(patient_trajectory, caption = "Example: Single Patient's Lab Trajectory")
#> Error: 找不到对象'patient_trajectory'

Complete Cleaning Pipeline

Here’s a complete workflow from raw lab data to analysis-ready data:

# Step 1: Data Overview
overview <- data_overview(lab_data)
#> === Data Overview Summary ===
#> Dataset: 300 rows, 5 columns
#> 
#> Variable Types:
#>   numeric   : 1 variables
#>   character : 3 variables
#>   date      : 1 variables
#> 
#> Found 6 potential quality issues:
#>   outliers                 : 1 cases
#>   missing_values           : 3 cases
#>   negative_in_positive     : 1 cases
#>   suspicious_dates         : 1 cases
#> 
#> Recommendations:
#>   - Review outliers in these numeric variables: value
#>   - Variables with < 50 % missing values: test_date, value, unit - consider imputation
#>   - Numeric variables with mostly positive values but containing negatives: value
#>   - Review suspicious dates (year < 1910 or > current year) in: test_date

# Step 2: Format Cleaning - Extract numeric values and convert dates
clean <- lab_data
clean$value <- extract_num(lab_data$value)
clean$test_date <- to_date(lab_data$test_date)
#> Error: Input must be numeric or character vector, got: Date

# Step 3: Unit Standardization
change_rules <- list(
  list(subject = "Glucose", target_unit = "mg/dL", units2change = "mmol/L", coeffs = 18),
  list(subject = "Creatinine", target_unit = "mg/dL", units2change = "umol/L", coeffs = 1 / 88.4),
  list(subject = "Cholesterol", target_unit = "mg/dL", units2change = "mmol/L", coeffs = 38.67)
)

clean <- unit_standardize(
  clean,
  subject_col = "test", value_col = "value",
  unit_col = "unit", change_rules = change_rules
)

# Step 4: Outlier Detection and Handling
for (test_name in c("Glucose", "Creatinine", "Cholesterol")) {
  test_idx <- clean$test == test_name
  test_values <- clean$value[test_idx]
  outlier_res <- detect_outliers(test_values, method = "iqr")
  clean$value[test_idx][outlier_res$outlier_mask] <- NA
}

# Step 5: Convert to wide format (longitudinal cohort data)
clean_wide <- clean %>%
  filter(!is.na(test_date)) %>%
  select(patient_id, test_date, test, value) %>%
  tidyr::pivot_wider(names_from = test, values_from = value) %>%
  arrange(patient_id, test_date)

# Step 6: Missing value filtering
clean_wide <- get_valid_subset(clean_wide, row_na_ratio = 0.5, col_na_ratio = 0.3)

# Step 7: Create clinical categories
clean_wide$glucose_category <- cut_by(
  clean_wide$Glucose,
  breaks = c(100, 126),
  labels = c("Normal", "Prediabetes", "Diabetes")
)
#> Error in cut_by(clean_wide$Glucose, breaks = c(100, 126), labels = c("Normal", : Some of `breaks` are not in the range of `x`!

# Step 8: Final check
final_overview <- data_overview(clean_wide)
#> === Data Overview Summary ===
#> Dataset: 259 rows, 2 columns
#> 
#> Variable Types:
#>   character : 1 variables
#>   date      : 1 variables
#> 
#> Found 1 potential quality issues:
#>   suspicious_dates         : 1 cases
#> 
#> Recommendations:
#>   - Review suspicious dates (year < 1910 or > current year) in: test_date
knitr::kable(final_overview$summary_stats, caption = "Final Data Quality Overview")
Final Data Quality Overview
variable n missing missing_pct unique top_value top_freq top_pct
patient_id patient_id 259 0 0 99 P001 3 1.16

cat(
  "\nOriginal records:", nrow(lab_data),
  "| Final records:", nrow(clean_wide),
  "| Removed:", nrow(lab_data) - nrow(clean_wide), "\n"
)
#> 
#> Original records: 300 | Final records: 259 | Removed: 41
cat(
  "Unique patients:", length(unique(lab_data$patient_id)),
  "->", length(unique(clean_wide$patient_id)), "\n"
)
#> Unique patients: 100 -> 99

Summary

Step Function Purpose Data Used
1. Data Overview data_overview() Comprehensive data quality assessment Raw lab data
2. Format Cleaning extract_num(), to_date(), df_view_nonnum() Clean messy strings, convert dates Raw lab data
3. Unit Standardization unit_standardize() Standardize measurement units Format-cleaned data
4. Outlier Detection detect_outliers(), iqr_outlier() Detect and handle outliers Standardized data
5. Missing Values get_valid_subset() Filter by missing rate Post-outlier data
6. Data Transformation cut_by() Create derived variables Cleaned data

Key Principles

  1. Standardize before outlier detection: Mixed units cause false outlier flags
  2. Verify outliers clinically: Some extreme values are clinically meaningful
  3. Track data loss: Document how many observations are removed at each step
  4. Preserve raw data: Never overwrite original data; create cleaned copies

mirror server hosted at Truenetwork, Russian Federation.