Data Cleaning and Preparation

Creating Realistic Messy Lab Data

Let’s create a sample laboratory dataset mimicking real-world EHR data with common quality issues. This dataset will be used throughout the vignette.

set.seed(123)
n <- 100

# Patient IDs
patient_ids <- paste0("P", sprintf("%03d", 1:n))

# Glucose: mix of mg/dL and mmol/L (1 mmol/L = 18 mg/dL)
# Also include some outliers and missing values
glucose_vals <- c(
  rnorm(35, mean = 100, sd = 15), # mg/dL normal range
  rnorm(30, mean = 5.5, sd = 0.8), # mmol/L normal range
  rnorm(10, mean = 200, sd = 20), # mg/dL high (outliers)
  rnorm(10, mean = 15, sd = 2), # mmol/L high (outliers)
  rep(NA, 10), # missing values
  999, 888, 0, -50, 500 # erroneous values
)
glucose_vals <- round(sample(glucose_vals, n), 1)

glucose_units <- sample(c("mg/dL", "mmol/L", NA), n, replace = TRUE, prob = c(0.5, 0.4, 0.1))

# Creatinine: mix of mg/dL and umol/L (1 mg/dL = 88.4 umol/L)
creatinine_vals <- c(
  rnorm(35, mean = 1.0, sd = 0.2), # mg/dL normal range
  rnorm(30, mean = 88, sd = 15), # umol/L normal range
  rnorm(10, mean = 3.0, sd = 0.5), # mg/dL high (outliers)
  rnorm(10, mean = 265, sd = 44), # umol/L high (outliers)
  rep(NA, 10), # missing values
  999, 0, -10, 1000, 0.001 # erroneous values
)
creatinine_vals <- round(sample(creatinine_vals, n), 2)

creatinine_units <- sample(c("mg/dL", "umol/L", NA), n, replace = TRUE, prob = c(0.5, 0.4, 0.1))

# Cholesterol: mix of mg/dL and mmol/L (1 mmol/L = 38.67 mg/dL)
cholesterol_vals <- c(
  rnorm(35, mean = 180, sd = 30), # mg/dL normal range
  rnorm(30, mean = 4.5, sd = 0.8), # mmol/L normal range
  rnorm(10, mean = 350, sd = 40), # mg/dL high (outliers)
  rnorm(10, mean = 9, sd = 1), # mmol/L high (outliers)
  rep(NA, 10), # missing values
  888, 999, 50, 600, 0 # erroneous values
)
cholesterol_vals <- round(sample(cholesterol_vals, n), 2)

cholesterol_units <- sample(c("mg/dL", "mmol/L", NA), n, replace = TRUE, prob = c(0.5, 0.4, 0.1))

# Test dates with various formats and impossible dates
test_date_vals <- sample(c(
  format(Sys.Date() - sample(1:365, 60, replace = TRUE), "%Y-%m-%d"), # ISO format
  format(Sys.Date() - sample(1:365, 30, replace = TRUE), "%Y/%m/%d"), # Slash format
  "1900-01-01", "2030-12-31", "N/A", "", "unknown", "pending", # Invalid dates
  rep(NA, 10)
), n * 3, replace = TRUE)

# Create long-format lab data with messy values
lab_data <- data.frame(
  patient_id = rep(patient_ids, 3),
  test = rep(c("Glucose", "Creatinine", "Cholesterol"), each = n),
  value = c(glucose_vals, creatinine_vals, cholesterol_vals),
  unit = c(glucose_units, creatinine_units, cholesterol_units),
  test_date = test_date_vals
)

# Add messy string values that need cleaning
# Glucose with text annotations and European decimal commas
messy_glucose_idx <- sample(which(lab_data$test == "Glucose"), 15)
lab_data$value[messy_glucose_idx] <- sample(c(
  "<40", ">500", "5.2", "6.8", "120 mg/dL", "89 mmol/L",
  "5.2 (fasting)", "180 (post-meal)", "N/A", "pending",
  "6..5", "7..2", "8.5.1", "normal range", "see comment"
), length(messy_glucose_idx), replace = TRUE)

# Creatinine with text annotations
messy_creatinine_idx <- sample(which(lab_data$test == "Creatinine"), 12)
lab_data$value[messy_creatinine_idx] <- sample(c(
  "<0.5", ">5.0", "1.1", "0.9", "1.2 mg/dL", "110 umol/L",
  "1.5 (dialysis)", "2.8 (critical)", "N/A", " hemolyzed",
  "1.3 ", " 1.4", "(1.2)", "1.0*", "see note"
), length(messy_creatinine_idx), replace = TRUE)

# Cholesterol with text annotations
messy_cholesterol_idx <- sample(which(lab_data$test == "Cholesterol"), 10)
lab_data$value[messy_cholesterol_idx] <- sample(c(
  "<100", ">400", "4.5", "5.2", "200 mg/dL", "5.5 mmol/L",
  "180 (fasting)", "250 (borderline)", "N/A", "lipemic"
), length(messy_cholesterol_idx), replace = TRUE)

knitr::kable(head(lab_data[sample(nrow(lab_data)), ], 10),
  caption = "Original Messy Lab Data (10 random rows)"
)

Original Messy Lab Data (10 random rows)
	patient_id	test	value	unit	test_date
267	P067	Cholesterol	4.56	mg/dL	2026-06-18
81	P081	Glucose	5.4	mg/dL	2025-12-14
260	P060	Cholesterol	227.31	mmol/L	2026/04/27
76	P076	Glucose	218.4	mg/dL	2025-12-18
184	P084	Creatinine	263.9	mg/dL	2026-04-04
14	P014	Glucose	190.2	mg/dL	2026/01/20
51	P051	Glucose	NA	mg/dL	2025/09/03
155	P055	Creatinine	1.09	umol/L	2026-06-09
75	P075	Glucose	5.7	NA	2026-01-03
236	P036	Cholesterol	8.39	mg/dL	2026/02/18

Step 1: Data Overview

Data used: lab_data (the messy lab data created above)

Use data_overview() to get a comprehensive diagnostic report of data quality issues:

overview <- data_overview(lab_data)
#> === Data Overview Summary ===
#> Dataset: 300 rows, 5 columns
#> 
#> Variable Types:
#>   character : 5 variables
#> 
#> Found 4 potential quality issues:
#>   numeric_as_character     : 1 cases
#>   missing_values           : 3 cases
#> 
#> Recommendations:
#>   - Consider converting these character variables to numeric: value
#>   - Variables with < 50 % missing values: unit, value, test_date - consider imputation

print(overview$variable_types)
#> $character
#> [1] "patient_id" "test"       "value"      "unit"       "test_date"
print(overview$summary_stats)
#> $character
#>              variable   n missing missing_pct unique   top_value top_freq
#> patient_id patient_id 300       0        0.00    100        P001        3
#> test             test 300       0        0.00      3 Cholesterol      100
#> value           value 275      25        8.33    236           0        4
#> unit             unit 268      32       10.67      3       mg/dL      159
#> test_date   test_date 279      21        7.00     86  2025-10-20        9
#>            top_pct
#> patient_id    1.00
#> test         33.33
#> value         1.45
#> unit         59.33
#> test_date     3.23
print(overview$quality_issues$missing_values)
#>      unit     value test_date 
#>     10.67      8.33      7.00

Check for unit conflicts across different test types:

knitr::kable(unit_view(lab_data, subject_col = "test", value_col = "value", unit_col = "unit"),
  caption = "Unit Conflicts by Test Type"
)

Unit Conflicts by Test Type
subject	unit	label	count	nvalid	mean	sd	median	q_0.025	q_0.975
Cholesterol	mg/dL	NA	50	41	119.34902	175.46788	9.70	3.70000	342.5300
Cholesterol	mmol/L	NA	40	33	185.58394	197.11032	183.62	3.73600	657.6000
Cholesterol	NA	NA	10	8	102.32375	84.31264	141.32	0.56000	189.1785
Creatinine	mg/dL	NA	54	46	94.63500	212.69011	2.87	0.70000	915.3400
Creatinine	umol/L	NA	37	32	57.27156	82.24823	2.72	-2.25000	292.5532
Creatinine	NA	NA	9	8	74.75500	109.80029	41.49	0.14175	283.1100
Glucose	mg/dL	NA	55	41	88.75610	159.26087	16.30	0.00000	500.0000
Glucose	mmol/L	NA	32	26	71.28846	63.38110	84.30	4.56250	202.9750
Glucose	NA	NA	13	9	178.11111	314.49209	82.90	5.74000	841.0000

Step 2: Format Cleaning

Data used: lab_data

Format cleaning is essential before any numerical analysis. Lab data often contains mixed formats, text annotations, and inconsistent date representations.

2.1 Checking for Non-numeric Values

First, identify which entries cannot be converted to numeric values:

nonnum_df <- df_view_nonnum(lab_data)
knitr::kable(head(nonnum_df, 15), caption = "Non-numeric Entries by Variable")

Non-numeric Entries by Variable
patient_id	test	test_date	unit	value
P001	Glucose	2026/07/10	mg/dL	N/A
P002	Creatinine	pending	mmol/L	7..2
P003	Cholesterol	2025/08/24	umol/L	180 (post-meal)
P004	NA	2026-01-25	NA	>500
P005	NA	1900-01-01	NA	<40
P006	NA	2026-06-08	NA	8.5.1
P007	NA	2026/05/30	NA	6..5
P008	NA	2026/02/18	NA	see comment
P009	NA	2026-06-18	NA	normal range
P010	NA	2026-06-09	NA	1.2 mg/dL
P011	NA	2025-10-20	NA	110 umol/L
P012	NA	2026/01/20	NA	>5.0
P013	NA	2026/03/16	NA	(1.2)
P014	NA	2026-02-11	NA	>400
P015	NA	2025/12/08	NA	250 (borderline)

2.2 Extracting Numeric Values

Use extract_num() to clean messy numeric strings. This function handles: - Text annotations (e.g., “120 mg/dL” → 120) - Range indicators (e.g., “<40”, “>500”) - Extra decimal points (e.g., “5..2” → 5.2) - Extra whitespace and punctuation

# Create a copy for cleaning
lab_data_cleaned <- lab_data

# Extract numeric values from the messy value column
lab_data_cleaned$value_numeric <- extract_num(lab_data$value)

# Show before/after comparison
comparison <- data.frame(
  test = lab_data$test,
  original = lab_data$value,
  cleaned = lab_data_cleaned$value_numeric,
  unit = lab_data$unit
) %>%
  dplyr::filter(original != cleaned | is.na(cleaned)) %>%
  head(15)

knitr::kable(comparison, caption = "Value Cleaning: Before vs After")

Value Cleaning: Before vs After
test	original	cleaned	unit
Glucose	N/A	NA	mg/dL
Glucose	7..2	7.2	mg/dL
Glucose	NA	NA	mmol/L
Glucose	NA	NA	mg/dL
Glucose	NA	NA	NA
Glucose	N/A	NA	mg/dL
Glucose	NA	NA	mg/dL
Glucose	180 (post-meal)	180.0	mmol/L
Glucose	NA	NA	mmol/L
Glucose	>500	500.0	mmol/L
Glucose	NA	NA	mg/dL
Glucose	<40	40.0	mg/dL
Glucose	8.5.1	8.5	mg/dL
Glucose	6..5	6.5	mg/dL
Glucose	<40	40.0	NA

2.3 Date Conversion

Lab data often contains test dates in various formats. Use to_date() to standardize them:

# Convert various date formats to standard Date objects
lab_data_cleaned$test_date_clean <- to_date(lab_data$test_date)

# Show date conversion results
date_comparison <- data.frame(
  original = lab_data$test_date,
  cleaned = lab_data_cleaned$test_date_clean
) %>%
  dplyr::filter(!is.na(original)) %>%
  head(15)

knitr::kable(date_comparison, caption = "Date Conversion: Before vs After")

Date Conversion: Before vs After
original	cleaned
2026/07/10	2026-07-10
pending	NA
2025/08/24	2025-08-24
2026-01-25	2026-01-25
1900-01-01	1900-01-01
2026-06-08	2026-06-08
2026/05/30	2026-05-30
2026/02/18	2026-02-18
2026-06-18	2026-06-18
2026-01-25	2026-01-25
2026-06-09	2026-06-09
2025-10-20	2025-10-20
2026/01/20	2026-01-20
2026/01/20	2026-01-20
2026/03/16	2026-03-16

Check for invalid dates that couldn’t be parsed:

invalid_dates <- lab_data_cleaned %>%
  dplyr::filter(!is.na(test_date) & is.na(test_date_clean)) %>%
  dplyr::select(patient_id, test, test_date) %>%
  head(10)

knitr::kable(invalid_dates, caption = "Invalid Date Entries (Could Not Be Parsed)")

Invalid Date Entries (Could Not Be Parsed)
patient_id	test	test_date
P002	Glucose	pending
P068	Glucose	unknown
P079	Glucose	unknown
P088	Glucose	pending
P018	Creatinine	unknown
P058	Creatinine	N/A
P059	Creatinine	N/A
P065	Creatinine
P073	Creatinine	unknown
P080	Creatinine	N/A

2.4 Update Data for Next Steps

Replace the original messy columns with cleaned versions:

lab_data <- lab_data_cleaned %>%
  dplyr::mutate(
    value = value_numeric,
    test_date = test_date_clean
  ) %>%
  dplyr::select(-value_numeric, -test_date_clean)

Step 3: Unit Standardization

Data used: lab_data

Unit standardization must be performed before outlier detection. This is critical when merging data from different laboratories. We will standardize all values to conventional units:

Glucose: mg/dL (mmol/L × 18 = mg/dL)
Creatinine: mg/dL (umol/L ÷ 88.4 = mg/dL)
Cholesterol: mg/dL (mmol/L × 38.67 = mg/dL)

change_rules <- list(
  list(subject = "Glucose", target_unit = "mg/dL", units2change = "mmol/L", coeffs = 18),
  list(subject = "Creatinine", target_unit = "mg/dL", units2change = "umol/L", coeffs = 1 / 88.4),
  list(subject = "Cholesterol", target_unit = "mg/dL", units2change = "mmol/L", coeffs = 38.67)
)

lab_data_std <- unit_standardize(
  lab_data,
  subject_col = "test", value_col = "value",
  unit_col = "unit", change_rules = change_rules
)

knitr::kable(head(lab_data_std, 15), caption = "Lab Data After Unit Standardization")

Lab Data After Unit Standardization
patient_id	test	value	unit	test_date
P002	Glucose	2282.4	mg/dL	NA
P003	Glucose	7.2	mg/dL	2025-08-24
P004	Glucose	95.6	mg/dL	2026-01-25
P005	Glucose	1512.0	mg/dL	1900-01-01
P006	Glucose	95.4	mg/dL	2026-06-08
P007	Glucose	186.2	mg/dL	2026-05-30
P008	Glucose	6.1	mg/dL	2026-02-18
P009	Glucose	257.4	mg/dL	2026-06-18
P011	Glucose	5.3	mg/dL	2026-06-09
P012	Glucose	6.6	mg/dL	2025-10-20
P013	Glucose	1614.6	mg/dL	2026-01-20
P014	Glucose	190.2	mg/dL	2026-01-20
P015	Glucose	5.1	mg/dL	2026-03-16
P016	Glucose	14.4	mg/dL	2026-02-11
P018	Glucose	6.7	mg/dL	NA

Verify that units are now standardized:

knitr::kable(unit_view(lab_data_std, subject_col = "test", value_col = "value", unit_col = "unit"),
  caption = "Units After Standardization"
)

Units After Standardization
subject	unit	label	count	nvalid	mean	sd	median	q_0.025	q_0.975

Step 4: Outlier Detection

Data used: lab_data_std (unit-standardized lab data)

Important: Outlier detection is performed after unit standardization to ensure consistent scales across all measurements.

The package provides three methods, each with different strengths:

MAD (Median Absolute Deviation): Robust to outliers, good for small samples
IQR (Interquartile Range): Most commonly used in clinical research
Z-score: Assumes normal distribution, use with caution for skewed data

Detect outliers for each test type using the IQR method:

# Split data by test type for outlier detection
lab_data_clean <- lab_data_std

# Detect outliers for each test
for (test_name in c("Glucose", "Creatinine", "Cholesterol")) {
  test_data <- lab_data_std$value[lab_data_std$test == test_name]

  outlier_res <- detect_outliers(test_data, method = "iqr")

  cat("\n", test_name, ":\n")
  cat("  Total values:", length(test_data), "\n")
  cat("  Outliers detected:", sum(outlier_res$outlier_mask, na.rm = TRUE), "\n")
  cat("  Missing values:", sum(is.na(test_data)), "\n")
}
#> 
#>  Glucose :
#>   Total values: 86 
#>   Outliers detected: 19 
#>   Missing values: 0 
#> 
#>  Creatinine :
#>   Total values: 92 
#>   Outliers detected: 23 
#>   Missing values: 0 
#> 
#>  Cholesterol :
#>   Total values: 90 
#>   Outliers detected: 22 
#>   Missing values: 0

Compare outlier detection methods for Glucose:

glucose_values <- lab_data_std$value[lab_data_std$test == "Glucose"]

knitr::kable(data.frame(
  Method = c("MAD", "IQR", "Z-score"),
  Outlier_Count = c(
    sum(mad_outlier(glucose_values, threshold = 3), na.rm = TRUE),
    sum(iqr_outlier(glucose_values, threshold = 1.5), na.rm = TRUE),
    sum(zscore_outlier(glucose_values, threshold = 3), na.rm = TRUE)
  )
), caption = "Outlier Detection Results by Method (Glucose)")

Outlier Detection Results by Method (Glucose)
Method	Outlier_Count
MAD	19
IQR	19
Z-score	1

Handling Outliers

For this demonstration, we will set outliers to NA. In practice, outliers should be manually verified — they may represent true extreme values (e.g., severe hyperglycemia) rather than data errors.

# Create a copy for outlier handling
lab_data_final <- lab_data_std

# Set outliers to NA for each test type
for (test_name in c("Glucose", "Creatinine", "Cholesterol")) {
  test_idx <- lab_data_std$test == test_name
  test_values <- lab_data_std$value[test_idx]

  outlier_res <- detect_outliers(test_values, method = "iqr")

  # Set outliers to NA
  lab_data_final$value[test_idx][outlier_res$outlier_mask] <- NA
}

# Compare before and after outlier handling
comparison_df <- data.frame(
  Test = rep(c("Glucose", "Creatinine", "Cholesterol"), each = 2),
  Stage = rep(c("Before", "After"), 3),
  N = c(
    sum(!is.na(lab_data_std$value[lab_data_std$test == "Glucose"])),
    sum(!is.na(lab_data_final$value[lab_data_final$test == "Glucose"])),
    sum(!is.na(lab_data_std$value[lab_data_std$test == "Creatinine"])),
    sum(!is.na(lab_data_final$value[lab_data_final$test == "Creatinine"])),
    sum(!is.na(lab_data_std$value[lab_data_std$test == "Cholesterol"])),
    sum(!is.na(lab_data_final$value[lab_data_final$test == "Cholesterol"]))
  ),
  Mean = c(
    mean(lab_data_std$value[lab_data_std$test == "Glucose"], na.rm = TRUE),
    mean(lab_data_final$value[lab_data_final$test == "Glucose"], na.rm = TRUE),
    mean(lab_data_std$value[lab_data_std$test == "Creatinine"], na.rm = TRUE),
    mean(lab_data_final$value[lab_data_final$test == "Creatinine"], na.rm = TRUE),
    mean(lab_data_std$value[lab_data_std$test == "Cholesterol"], na.rm = TRUE),
    mean(lab_data_final$value[lab_data_final$test == "Cholesterol"], na.rm = TRUE)
  )
)

knitr::kable(comparison_df, caption = "Before vs After Outlier Handling", digits = 2)

Before vs After Outlier Handling
Test	Stage	N	Mean
Glucose	Before	86	599.81
Glucose	After	67	82.01
Creatinine	Before	92	55.35
Creatinine	After	69	1.11
Cholesterol	Before	90	2818.59
Cholesterol	After	68	147.00

Clinical Note: Always verify outliers before removal. Some “outliers” may be clinically significant (e.g., glucose > 400 mg/dL in diabetic ketoacidosis). Consider creating an outlier indicator variable for sensitivity analyses rather than deleting values.

Step 5: Missing Value Handling

Data used: lab_data_final (after standardization and outlier handling)

For missing data filtering by row/column missing rate, use get_valid_subset() to obtain a complete-enough subset for analysis. First, let’s convert to wide format to create a longitudinal cohort dataset where each row represents a patient-test date combination:

# Convert to wide format preserving all test dates
# This creates a longitudinal cohort dataset
lab_wide <- lab_data_final %>%
  filter(!is.na(test_date)) %>% # Remove records without valid dates
  select(patient_id, test_date, test, value) %>%
  tidyr::pivot_wider(
    names_from = test,
    values_from = value,
    names_sort = TRUE
  )

# Sort by patient and date for longitudinal view
lab_wide <- lab_wide %>%
  arrange(patient_id, test_date)

knitr::kable(head(lab_wide, 10), caption = "Longitudinal Lab Data (First 10 records)")

Longitudinal Lab Data (First 10 records)
patient_id	test_date	Cholesterol	Creatinine	Glucose
P001	2025-12-21	NA	1.13	NA
P001	2026-01-30	3.70	NA	NA
P002	2025-12-10	9.70	NA	NA
P002	2026-02-17	NA	NA	NA
P003	2025-08-24	NA	NA	7.2
P003	2026-04-17	NA	NA	NA
P003	2026-06-18	NA	NA	NA
P004	2026-01-08	329.49	NA	NA
P004	2026-01-25	NA	NA	95.6
P004	2026-05-30	NA	1.20	NA

View the cohort structure - number of tests per patient:

cohort_summary <- lab_wide %>%
  group_by(patient_id) %>%
  summarise(
    n_visits = n(),
    first_date = min(test_date),
    last_date = max(test_date)
  ) %>%
  head(10)

knitr::kable(cohort_summary, caption = "Cohort Structure: Visits per Patient")

Cohort Structure: Visits per Patient
patient_id	n_visits	first_date	last_date
P001	2	2025-12-21	2026-01-30
P002	2	2025-12-10	2026-02-17
P003	3	2025-08-24	2026-06-18
P004	3	2026-01-08	2026-05-30
P005	3	1900-01-01	2026-04-10
P006	2	2026-04-24	2026-06-08
P007	2	2026-05-30	2030-12-31
P008	3	2025-11-13	2026-04-01
P009	3	2026-02-05	2026-06-18
P010	2	2025-10-14	2025-12-21

Assess missing data patterns:

missing_summary <- data.frame(
  Variable = names(lab_wide)[-1],
  Missing_Count = sapply(lab_wide[-1], function(x) sum(is.na(x))),
  Missing_Percent = round(sapply(lab_wide[-1], function(x) sum(is.na(x)) / length(x) * 100, 2))
)
#> Error in `FUN()`:
#> ! 参数没有用(2)

knitr::kable(missing_summary, caption = "Missing Data Summary by Test")
#> Error:
#> ! 找不到对象'missing_summary'

Filter records with too much missing data:

lab_wide_clean <- get_valid_subset(
  lab_wide,
  row_na_ratio = 0.5, # Allow up to 50% missing per record
  col_na_ratio = 0.3 # Allow up to 30% missing per test type
)

cat(
  "Dimensions:", nrow(lab_wide), "records",
  "->", nrow(lab_wide_clean), "records\n"
)
#> Dimensions: 232 records -> 232 records
cat(
  "Unique patients:", length(unique(lab_wide$patient_id)),
  "->", length(unique(lab_wide_clean$patient_id)), "\n"
)
#> Unique patients: 99 -> 99

Step 6: Data Transformation

Data used: lab_wide_clean (cleaned longitudinal cohort data)

Creating Clinical Categories

Create categorical variables from continuous lab values using clinical cut-points. In longitudinal data, each visit can have different classifications:

# Glucose categories
lab_wide_clean$glucose_category <- cut_by(
  lab_wide_clean$Glucose,
  breaks = c(100, 126),
  labels = c("Normal", "Prediabetes", "Diabetes"),
  label_with_range = FALSE
)
#> Error in `cut_by()`:
#> ! Some of `breaks` are not in the range of `x`!

# Creatinine categories (eGFR approximation)
lab_wide_clean$renal_function <- cut_by(
  lab_wide_clean$Creatinine,
  breaks = c(1.2, 2.0, 4.0),
  labels = c("Normal", "Mild", "Moderate", "Severe"),
  label_with_range = FALSE
)
#> Error in `cut_by()`:
#> ! Some of `breaks` are not in the range of `x`!

# Cholesterol categories
lab_wide_clean$cholesterol_category <- cut_by(
  lab_wide_clean$Cholesterol,
  breaks = c(200, 240),
  labels = c("Desirable", "Borderline", "High"),
  label_with_range = FALSE
)
#> Error in `cut_by()`:
#> ! Some of `breaks` are not in the range of `x`!

# Summary of categories across all visits
cat_summary <- data.frame(
  Category = c(
    rep("Glucose", length(table(lab_wide_clean$glucose_category))),
    rep("Renal Function", length(table(lab_wide_clean$renal_function))),
    rep("Cholesterol", length(table(lab_wide_clean$cholesterol_category)))
  ),
  Level = c(
    names(table(lab_wide_clean$glucose_category)),
    names(table(lab_wide_clean$renal_function)),
    names(table(lab_wide_clean$cholesterol_category))
  ),
  Count = c(
    as.vector(table(lab_wide_clean$glucose_category)),
    as.vector(table(lab_wide_clean$renal_function)),
    as.vector(table(lab_wide_clean$cholesterol_category))
  )
)

knitr::kable(cat_summary, caption = "Clinical Category Distributions (All Visits)")

Clinical Category Distributions (All Visits)
Category	Count

Longitudinal View Example

View a single patient’s trajectory over time:

# Select one patient with multiple visits for demonstration
patient_trajectory <- lab_wide_clean %>%
  filter(patient_id == unique(patient_id)[1]) %>%
  select(
    patient_id, test_date, Glucose, Creatinine, Cholesterol,
    glucose_category, renal_function
  ) %>%
  arrange(test_date)
#> Error in `select()`:
#> ! Can't select columns that don't exist.
#> ✖ Column `Glucose` doesn't exist.

knitr::kable(patient_trajectory, caption = "Example: Single Patient's Lab Trajectory")
#> Error:
#> ! 找不到对象'patient_trajectory'

Complete Cleaning Pipeline

Here’s a complete workflow from raw lab data to analysis-ready data:

# Step 1: Data Overview
overview <- data_overview(lab_data)
#> === Data Overview Summary ===
#> Dataset: 300 rows, 5 columns
#> 
#> Variable Types:
#>   numeric   : 1 variables
#>   character : 3 variables
#>   date      : 1 variables
#> 
#> Found 6 potential quality issues:
#>   outliers                 : 1 cases
#>   missing_values           : 3 cases
#>   negative_in_positive     : 1 cases
#>   suspicious_dates         : 1 cases
#> 
#> Recommendations:
#>   - Review outliers in these numeric variables: value
#>   - Variables with < 50 % missing values: test_date, value, unit - consider imputation
#>   - Numeric variables with mostly positive values but containing negatives: value
#>   - Review suspicious dates (year < 1910 or > current year) in: test_date

# Step 2: Format Cleaning - Extract numeric values and convert dates
clean <- lab_data
clean$value <- extract_num(lab_data$value)
clean$test_date <- to_date(lab_data$test_date)
#> Error:
#> ! Input must be numeric or character vector, got: Date

# Step 3: Unit Standardization
change_rules <- list(
  list(subject = "Glucose", target_unit = "mg/dL", units2change = "mmol/L", coeffs = 18),
  list(subject = "Creatinine", target_unit = "mg/dL", units2change = "umol/L", coeffs = 1 / 88.4),
  list(subject = "Cholesterol", target_unit = "mg/dL", units2change = "mmol/L", coeffs = 38.67)
)

clean <- unit_standardize(
  clean,
  subject_col = "test", value_col = "value",
  unit_col = "unit", change_rules = change_rules
)

# Step 4: Outlier Detection and Handling
for (test_name in c("Glucose", "Creatinine", "Cholesterol")) {
  test_idx <- clean$test == test_name
  test_values <- clean$value[test_idx]
  outlier_res <- detect_outliers(test_values, method = "iqr")
  clean$value[test_idx][outlier_res$outlier_mask] <- NA
}

# Step 5: Convert to wide format (longitudinal cohort data)
clean_wide <- clean %>%
  filter(!is.na(test_date)) %>%
  select(patient_id, test_date, test, value) %>%
  tidyr::pivot_wider(names_from = test, values_from = value) %>%
  arrange(patient_id, test_date)

# Step 6: Missing value filtering
clean_wide <- get_valid_subset(clean_wide, row_na_ratio = 0.5, col_na_ratio = 0.3)

# Step 7: Create clinical categories
clean_wide$glucose_category <- cut_by(
  clean_wide$Glucose,
  breaks = c(100, 126),
  labels = c("Normal", "Prediabetes", "Diabetes")
)
#> Error in `cut_by()`:
#> ! Some of `breaks` are not in the range of `x`!

# Step 8: Final check
final_overview <- data_overview(clean_wide)
#> === Data Overview Summary ===
#> Dataset: 232 rows, 2 columns
#> 
#> Variable Types:
#>   character : 1 variables
#>   date      : 1 variables
#> 
#> Found 1 potential quality issues:
#>   suspicious_dates         : 1 cases
#> 
#> Recommendations:
#>   - Review suspicious dates (year < 1910 or > current year) in: test_date
knitr::kable(final_overview$summary_stats, caption = "Final Data Quality Overview")

Final Data Quality Overview

	variable	n	missing	missing_pct	unique	top_value	top_freq	top_pct
patient_id	patient_id	232	0	0	99	P003	3	1.29


cat(
  "\nOriginal records:", nrow(lab_data),
  "| Final records:", nrow(clean_wide),
  "| Removed:", nrow(lab_data) - nrow(clean_wide), "\n"
)
#> 
#> Original records: 300 | Final records: 232 | Removed: 68
cat(
  "Unique patients:", length(unique(lab_data$patient_id)),
  "->", length(unique(clean_wide$patient_id)), "\n"
)
#> Unique patients: 100 -> 99

Summary

Step	Function	Purpose	Data Used
1. Data Overview	`data_overview()`	Comprehensive data quality assessment	Raw lab data
2. Format Cleaning	`extract_num()`, `to_date()`, `df_view_nonnum()`	Clean messy strings, convert dates	Raw lab data
3. Unit Standardization	`unit_standardize()`	Standardize measurement units	Format-cleaned data
4. Outlier Detection	`detect_outliers()`, `iqr_outlier()`	Detect and handle outliers	Standardized data
5. Missing Values	`get_valid_subset()`	Filter by missing rate	Post-outlier data
6. Data Transformation	`cut_by()`	Create derived variables	Cleaned data

Key Principles

Standardize before outlier detection: Mixed units cause false outlier flags
Verify outliers clinically: Some extreme values are clinically meaningful
Track data loss: Document how many observations are removed at each step
Preserve raw data: Never overwrite original data; create cleaned copies