Advanced Data Operations

library(clinpubr)
library(dplyr)
library(survival)

Introduction

Clinical research often requires complex data manipulation: multi-source integration, fuzzy matching, temporal alignment, and format conversion. This vignette covers the advanced tools in clinpubr for these tasks:

  1. Cohort Screening — CONSORT-style exclusion tracking
  2. Data Merging — Substring and time-range matching
  3. Data Transformation — Long/wide format, multi-choice data
  4. Utility Functions — String manipulation, list operations
  5. Multi-Table Screeningscreen_data_list() for complex cohort workflows

Cohort Screening with Exclusion Tracking

exclusion_count() creates a CONSORT-style flow diagram, tracking how many patients are excluded at each stage:

set.seed(123)

patients <- data.frame(
  patient_id = 1:100,
  age = round(rnorm(100, 60, 15)),
  gender = sample(c("M", "F"), 100, replace = TRUE),
  has_baseline_data = sample(c(TRUE, FALSE), 100, prob = c(0.9, 0.1), replace = TRUE),
  meets_inclusion = sample(c(TRUE, FALSE), 100, prob = c(0.8, 0.2), replace = TRUE),
  complete_followup = sample(c(TRUE, FALSE), 100, prob = c(0.85, 0.15), replace = TRUE)
)

exclusion_summary <- exclusion_count(
  patients,
  age < 50,
  gender != "M",
  !has_baseline_data,
  !meets_inclusion,
  !complete_followup
)

knitr::kable(exclusion_summary, caption = "CONSORT-Style Cohort Flow")
CONSORT-Style Cohort Flow
Criteria N
Initial N 100
age < 50 16
gender != “M” 44
!has_baseline_data 7
!meets_inclusion 6
!complete_followup 7
Final N 20

cat("Retention rate:", round(100 * exclusion_summary$N[nrow(exclusion_summary)] / exclusion_summary$N[1], 1), "%\n")
#> Retention rate: 20 %

Customize exclusion criteria names:

exclusion_flow <- exclusion_count(
  patients,
  age < 18 | age > 80,
  !has_baseline_data,
  !meets_inclusion,
  .criteria_names = c(
    "Age outside 18-80 range",
    "Missing baseline data",
    "Does not meet inclusion criteria"
  )
)

knitr::kable(exclusion_flow, caption = "Custom Exclusion Flow")
Custom Exclusion Flow
Criteria N
Initial N 100
Age outside 18-80 range 9
Missing baseline data 10
Does not meet inclusion criteria 14
Final N 67

Data Merging Strategies

Merge by Substring Matching

merge_by_substring() matches records when exact identifiers don’t align (e.g., mapping free-text diagnoses to ICD codes):

medical_terms <- data.frame(
  match_term = c(
    "Type 2 Diabetes", "Hypertension", "Coronary Artery Disease",
    "Coronary Disease", "Chronic Kidney Disease", "Heart Failure", "Atrial Fibrillation"
  ),
  standard_term = c(
    "Type 2 Diabetes Mellitus", "Hypertension", "Coronary Artery Disease",
    "Coronary Artery Disease", "Chronic Kidney Disease", "Heart Failure", "Atrial Fibrillation"
  ),
  icd_code = c("E11", "I10", "I25", "I25", "N18", "I50", "I48"),
  category = c(
    "Endocrine", "Cardiovascular", "Cardiovascular", "Cardiovascular",
    "Renal", "Cardiovascular", "Cardiovascular"
  )
)

patient_diagnoses <- data.frame(
  patient_id = 1:10,
  diagnosis_text = c(
    "Severe Type 2 Diabetes", "Type 2 Diabetes Mellitus",
    "Type 2 Diabetes with Complications", "Essential Hypertension",
    "Hypertensive disease", "CAD - Coronary Artery Disease",
    "Coronary disease", "CKD Stage 3",
    "Congestive Heart Failure", "Heart failure chronic"
  )
)

merged_substring <- merge_by_substring(
  data = patient_diagnoses,
  key_df = medical_terms,
  search_col = "diagnosis_text",
  key_col = "match_term",
  value_cols = c("standard_term", "icd_code", "category")
)

knitr::kable(medical_terms, caption = "Medical Terms Table")
Medical Terms Table
match_term standard_term icd_code category
Type 2 Diabetes Type 2 Diabetes Mellitus E11 Endocrine
Hypertension Hypertension I10 Cardiovascular
Coronary Artery Disease Coronary Artery Disease I25 Cardiovascular
Coronary Disease Coronary Artery Disease I25 Cardiovascular
Chronic Kidney Disease Chronic Kidney Disease N18 Renal
Heart Failure Heart Failure I50 Cardiovascular
Atrial Fibrillation Atrial Fibrillation I48 Cardiovascular
knitr::kable(merged_substring, caption = "Diagnoses with Mapped ICD Codes")
Diagnoses with Mapped ICD Codes
diagnosis_text patient_id standard_term icd_code category
CAD - Coronary Artery Disease 6 Coronary Artery Disease I25 Cardiovascular
CKD Stage 3 8 NA NA NA
Congestive Heart Failure 9 Heart Failure I50 Cardiovascular
Coronary disease 7 Coronary Artery Disease I25 Cardiovascular
Essential Hypertension 4 Hypertension I10 Cardiovascular
Heart failure chronic 10 Heart Failure I50 Cardiovascular
Hypertensive disease 5 NA NA NA
Severe Type 2 Diabetes 1 Type 2 Diabetes Mellitus E11 Endocrine
Type 2 Diabetes Mellitus 2 Type 2 Diabetes Mellitus E11 Endocrine
Type 2 Diabetes with Complications 3 Type 2 Diabetes Mellitus E11 Endocrine

Merge by Range (Time-Based Matching)

merge_by_range() matches events within specific time windows (e.g., lab results during hospitalization):

patient_visits <- data.frame(
  patient_id = rep(1:3, each = 2),
  visit_id = 1:6,
  visit_start = as.Date(c(
    "2023-01-01", "2023-06-01", "2023-02-01",
    "2023-07-01", "2023-03-01", "2023-08-01"
  )),
  visit_end = as.Date(c(
    "2023-01-10", "2023-06-10", "2023-02-10",
    "2023-07-10", "2023-03-10", "2023-08-10"
  ))
)

lab_results <- data.frame(
  lab_id = 1:6,
  patient_id = c(1, 1, 2, 2, 3, 3),
  test_date = as.Date(c(
    "2023-01-05", "2023-06-05", "2023-02-03",
    "2023-07-08", "2023-03-15", "2023-08-05"
  )),
  test_name = c("Glucose", "HbA1c", "Glucose", "Creatinine", "Glucose", "Lipid panel"),
  result = round(rnorm(6, 100, 15), 1)
)

merged_range <- merge_by_range(
  x = patient_visits, y = lab_results,
  by = "patient_id",
  x_start = "visit_start", x_end = "visit_end",
  y_val = "test_date"
)

knitr::kable(merged_range, caption = "Labs Matched to Visit Windows")
Labs Matched to Visit Windows
patient_id visit_id visit_start visit_end lab_id test_date test_name result since_start
1 1 2023-01-01 2023-01-10 1 2023-01-05 Glucose 89.3 4
1 2 2023-06-01 2023-06-10 2 2023-06-05 HbA1c 88.7 4
2 3 2023-02-01 2023-02-10 3 2023-02-03 Glucose 85.9 2
2 4 2023-07-01 2023-07-10 4 2023-07-08 Creatinine 84.2 7
3 6 2023-08-01 2023-08-10 6 2023-08-05 Lipid panel 105.0 4
NA NA NA NA 5 2023-03-15 Glucose 93.4 NA

Data Transformation

Long to Wide Format

to_wide() converts long-format data (one row per measurement) to wide format (one row per patient) to facilitate analysis:

long_labs <- data.frame(
  patient_id = rep(1:5, each = 3),
  visit = rep(c(1, 2), times = c(8, 7)),
  test = rep(c("glucose", "creatinine", "cholesterol"), 5),
  value = round(rnorm(15, 100, 20), 1)
)

wide_labs <- to_wide(
  df = long_labs,
  keys = c("patient_id", "visit"),
  item_col = "test",
  value_col = "value"
)

knitr::kable(head(long_labs), caption = "Laboratory Data - Long Format")
Laboratory Data - Long Format
patient_id visit test value
1 1 glucose 59.7
1 1 creatinine 104.2
1 1 cholesterol 124.7
2 1 glucose 140.8
2 1 creatinine 126.0
2 1 cholesterol 115.1
knitr::kable(head(wide_labs), caption = "Laboratory Data - Wide Format")
Laboratory Data - Wide Format
patient_id visit cholesterol creatinine glucose
1 1 124.7 104.2 59.7
2 1 115.1 126.0 140.8
3 1 NA 88.0 65.5
3 2 93.0 NA NA
4 2 74.8 97.9 114.1
5 2 104.7 118.2 133.7

Multi-Choice Data

split_multichoice() splits comma-separated multi-choice columns into binary indicators; combine_multichoice() recombines them:

set.seed(456)
survey_data <- data.frame(
  id = 1:20,
  symptoms = sapply(1:20, function(x) {
    paste(sample(
      c("fever", "cough", "headache", "fatigue"),
      sample(1:4, 1)
    ), collapse = ",")
  }),
  comorbidities = sapply(1:20, function(x) {
    paste(sample(
      c("diabetes", "hypertension"),
      sample(1:2, 1)
    ), collapse = ",")
  })
)

symptoms_split <- split_multichoice(
  survey_data,
  quest_cols = c("symptoms", "comorbidities"),
  split = ",",
  remove_space = FALSE
)

knitr::kable(head(survey_data), caption = "Multi-Choice Data")
Multi-Choice Data
id symptoms comorbidities
1 fever diabetes
2 cough,fever,fatigue diabetes,hypertension
3 fever,cough,fatigue hypertension
4 fever,headache diabetes
5 headache,fatigue hypertension,diabetes
6 fatigue,cough,headache diabetes

knitr::kable(head(symptoms_split), caption = "Split Multi-Choice Data")
Split Multi-Choice Data
id symptoms_fever symptoms_cough symptoms_fatigue symptoms_headache comorbidities_diabetes comorbidities_hypertension
1 TRUE FALSE FALSE FALSE TRUE FALSE
2 TRUE TRUE TRUE FALSE TRUE TRUE
3 TRUE TRUE TRUE FALSE FALSE TRUE
4 TRUE FALSE FALSE TRUE TRUE FALSE
5 FALSE FALSE TRUE TRUE TRUE TRUE
6 FALSE TRUE TRUE TRUE TRUE FALSE

combined <- combine_multichoice(
  symptoms_split,
  quest_cols = list(
    respiratory = c("symptoms_cough", "symptoms_fatigue"),
    systemic = c("symptoms_fever", "symptoms_headache")
  ),
  sep = ","
)

knitr::kable(head(combined), caption = "Combined Symptom Groups")
Combined Symptom Groups
id comorbidities_diabetes comorbidities_hypertension respiratory systemic
1 TRUE FALSE fever
2 TRUE TRUE cough,fatigue fever
3 FALSE TRUE cough,fatigue fever
4 TRUE FALSE fever,headache
5 TRUE TRUE fatigue headache
6 TRUE FALSE cough,fatigue headache

Utility Functions

String Manipulation

common_prefix() extracts the common prefix from a character vector, could be used when processing hospital exports to identify files from the same patient or batch processing multi-site survey data by site prefix.

file_names <- c("patient_001_lab.csv", "patient_001_visit.csv", "patient_001_demo.csv")
common_prefix(file_names)
#> [1] "patient_001_"

str_match_replace() replaces matched patterns in strings with specified replacements, could be used when standardizing lab test names across hospitals or unifying option labels across translated questionnaires.

test_names <- c("Glucose_Fasting", "Glucose_Random", "Cholesterol_Total", "LDL_Calculated")
standardized <- str_match_replace(
  x = test_names,
  to_match = c("Glucose", "Cholesterol", "LDL"),
  to_replace = c("GLU", "CHOL", "LDL_CHOL")
)

knitr::kable(data.frame(Original = test_names, Standardized = standardized),
  caption = "Test Name Standardization"
)
Test Name Standardization
Original Standardized
Glucose_Fasting GLU_Fasting
Glucose_Random GLU_Random
Cholesterol_Total CHOL_Total
LDL_Calculated LDL_CHOL_Calculated

List and Data Frame Operations

add_lists() adds corresponding elements of two lists element-wise, could be used when aggregating disease incidence counts across time periods or summing response frequencies across survey sites.

list1 <- list(diabetes = 10, hypertension = 20, asthma = 5)
list2 <- list(diabetes = 15, hypertension = 25, asthma = 8)
add_lists(list1, list2)
#> $diabetes
#> [1] 25
#> 
#> $hypertension
#> [1] 45
#> 
#> $asthma
#> [1] 13

merge_ordered_vectors() merges multiple ordered vectors preserving unique elements and original ordering, could be used when combining inclusion criteria lists from multi-site studies or merging question orders across questionnaire versions.

sites <- list(
  c("Diabetes", "Hypertension", "Heart Failure", "CKD"),
  c("Hypertension", "COPD", "Diabetes", "Cancer"),
  c("CKD", "Diabetes", "Stroke", "Hypertension")
)
merge_ordered_vectors(sites)
#> [1] "COPD"          "Diabetes"      "Stroke"        "Hypertension" 
#> [5] "Heart Failure" "Cancer"        "CKD"

replace_elements() replaces specified elements in a vector with new values, could be used when correcting data entry errors in clinical data or harmonizing option codes across survey interviewers.

sample_data <- data.frame(
  group = c("Cntrol", "Treetment", "Placebo", "Cntrol", "Treatment"),
  value = 1:5
)
recoded <- replace_elements(
  x = sample_data$group,
  from = c("Cntrol", "Treetment"),
  to = c("Control", "Treatment")
)
knitr::kable(data.frame(Original = sample_data$group, Recoded = recoded),
  caption = "Group Recoding"
)
Group Recoding
Original Recoded
Cntrol Control
Treetment Treatment
Placebo Placebo
Cntrol Control
Treatment Treatment

fill_with_last() fills missing values with the last valid observation (LOCF), could be used when handling merged cells from Excel imports of EMR data or filling respondent demographics across repeated survey measures.

time_series <- c(120, NA, NA, 125, NA, 130, NA, NA)
knitr::kable(data.frame(Original = time_series, Filled = fill_with_last(time_series)),
  caption = "Last Observation Carried Forward"
)
Last Observation Carried Forward
Original Filled
120 120
NA 120
NA 120
125 125
NA 125
130 130
NA 130
NA 130

Multi-Table Cohort Screening

screen_data_list() filters and joins multiple tables based on clinical criteria, with full audit trails. This is essential for retrospective EHR studies.

Scenario: Building a Diabetes Cohort

# Patient demographics
patient <- data.frame(
  pid = 1:50,
  age = sample(25:75, 50, replace = TRUE),
  gender = sample(c("M", "F"), 50, replace = TRUE)
)

# Admission records
admission <- data.frame(
  pid = c(1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
  vid = 101:115,
  admit_day = c(1, 30, 5, 45, 10, 60, 15, 20, 25, 35, 40, 50, 55, 65, 70)
)

# Diagnosis records
diagnosis <- data.frame(
  pid = c(1, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
  vid = c(101, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 113, 114, 115),
  dx_day = c(1, 30, 5, 45, 10, 15, 20, 25, 35, 40, 50, 55, 65, 70),
  icd = c("E11", "I10", "N18", "E11", "E11", "I25", "E11", "J18", "E11", "I10", "E11", "N18", "E11", "I50")
)

# Lab results
lab <- data.frame(
  pid = c(1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
  vid = c(101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115),
  lab_day = c(1, 30, 5, 45, 10, 60, 15, 20, 25, 35, 40, 50, 55, 65, 70),
  HbA1c = c(8.5, 7.2, 9.1, 7.8, 8.0, 6.9, 7.5, 8.2, 7.0, 9.5, 7.8, 8.1, 7.3, 8.8, 7.6)
)

Screen and Filter

Select patients with diabetes (ICD: E11), keeping records from first diabetes diagnosis onward:

screened_data <- screen_data_list(
  data_list = list(
    patient = patient, admission = admission,
    diagnosis = diagnosis, lab = lab
  ),
  entry_expr = any(icd == "E11"),
  entry_level = "patient_id",
  anchor_expr = icd == "E11",
  anchor_level = "date",
  patient_id_map = "pid",
  visit_id_map = c(admission = "vid", diagnosis = "vid", lab = "vid"),
  date_map = c(admission = "admit_day", diagnosis = "dx_day", lab = "lab_day"),
  output = "list",
  return_audit = TRUE
)

knitr::kable(screened_data$audit$entry_scope, caption = "Entry Stage Audit")
Entry Stage Audit
step table before after removed
entry_scope patient 50 7 43
entry_scope admission 15 10 5
entry_scope diagnosis 14 9 5
entry_scope lab 15 10 5
knitr::kable(screened_data$audit$anchor_scope, caption = "Anchor Stage Audit")
Anchor Stage Audit
step table before after removed
anchor_scope patient 7 7 0
anchor_scope admission 10 9 1
anchor_scope diagnosis 9 8 1
anchor_scope lab 10 9 1

Review Filtered Data

knitr::kable(head(screened_data$data$patient), caption = "Filtered Patients")
Filtered Patients
pid age gender
1 1 51 M
2 2 72 M
3 3 38 M
5 5 75 F
7 7 59 F
9 9 28 M
knitr::kable(screened_data$data$diagnosis, caption = "Filtered Diagnoses")
Filtered Diagnoses
pid vid dx_day icd
1 1 101 1 E11
2 1 102 30 I10
4 2 104 45 E11
5 3 105 10 E11
7 5 108 20 E11
9 7 110 35 E11
11 9 112 50 E11
13 11 114 65 E11
knitr::kable(screened_data$data$lab, caption = "Filtered Labs")
Filtered Labs
pid vid lab_day HbA1c
1 1 101 1 8.5
2 1 102 30 7.2
4 2 104 45 7.8
5 3 105 10 8.0
6 3 106 60 6.9
8 5 108 20 8.2
10 7 110 35 9.5
12 9 112 50 8.1
14 11 114 65 8.8

Joined Output

Alternatively, get a single joined data frame:

joined_result <- screen_data_list(
  data_list = list(
    patient = patient, admission = admission,
    diagnosis = diagnosis, lab = lab
  ),
  entry_expr = any(icd == "E11"),
  entry_level = "patient_id",
  anchor_expr = any(icd == "E11"),
  anchor_level = "date",
  anchor_window = "from_first_anchor",
  patient_id_map = "pid",
  visit_id_map = c(admission = "vid", diagnosis = "vid", lab = "vid"),
  date_map = c(admission = "admit_day", diagnosis = "dx_day", lab = "lab_day"),
  output = "joined"
)

cat("Joined data:", nrow(joined_result), "rows,", ncol(joined_result), "columns\n")
#> Joined data: 9 rows, 7 columns
knitr::kable(head(joined_result), caption = "Joined Output")
Joined Output
patient_id age gender visit_id date icd HbA1c
1 51 M 101 1 E11 8.5
1 51 M 102 30 I10 7.2
2 72 M 104 45 E11 7.8
3 38 M 105 10 E11 8.0
3 38 M 106 60 NA 6.9
5 75 F 108 20 E11 8.2

Summary

Function Clinical Application
exclusion_count() CONSORT-style cohort flow documentation
screen_data_list() Multi-table cohort screening with audit trails
merge_by_substring() Substring matching for diagnosis/procedure coding
merge_by_range() Time-window matching for longitudinal data
to_wide() Long-to-wide format conversion
split_multichoice() / combine_multichoice() Survey and symptom data processing
Utility functions String standardization, list operations, NA filling

mirror server hosted at Truenetwork, Russian Federation.