Advanced Data Operations

Introduction

Clinical research often requires complex data manipulation: multi-source integration, fuzzy matching, temporal alignment, and format conversion. This vignette covers the advanced tools in clinpubr for these tasks:

Cohort Screening — CONSORT-style exclusion tracking
Data Merging — Substring and time-range matching
Data Transformation — Long/wide format, multi-choice data
Utility Functions — String manipulation, list operations
Multi-Table Screening — screen_data_list() for complex cohort workflows

Cohort Screening with Exclusion Tracking

exclusion_count() creates a CONSORT-style flow diagram, tracking how many patients are excluded at each stage:

set.seed(123)

patients <- data.frame(
  patient_id = 1:100,
  age = round(rnorm(100, 60, 15)),
  gender = sample(c("M", "F"), 100, replace = TRUE),
  has_baseline_data = sample(c(TRUE, FALSE), 100, prob = c(0.9, 0.1), replace = TRUE),
  meets_inclusion = sample(c(TRUE, FALSE), 100, prob = c(0.8, 0.2), replace = TRUE),
  complete_followup = sample(c(TRUE, FALSE), 100, prob = c(0.85, 0.15), replace = TRUE)
)

exclusion_summary <- exclusion_count(
  patients,
  age < 50,
  gender != "M",
  !has_baseline_data,
  !meets_inclusion,
  !complete_followup
)

knitr::kable(exclusion_summary, caption = "CONSORT-Style Cohort Flow")

CONSORT-Style Cohort Flow
Criteria	N
Initial N	100
age < 50	16
gender != “M”	44
!has_baseline_data	7
!meets_inclusion	6
!complete_followup	7
Final N	20


cat("Retention rate:", round(100 * exclusion_summary$N[nrow(exclusion_summary)] / exclusion_summary$N[1], 1), "%\n")
#> Retention rate: 20 %

Customize exclusion criteria names:

exclusion_flow <- exclusion_count(
  patients,
  age < 18 | age > 80,
  !has_baseline_data,
  !meets_inclusion,
  .criteria_names = c(
    "Age outside 18-80 range",
    "Missing baseline data",
    "Does not meet inclusion criteria"
  )
)

knitr::kable(exclusion_flow, caption = "Custom Exclusion Flow")

Custom Exclusion Flow
Criteria	N
Initial N	100
Age outside 18-80 range	9
Missing baseline data	10
Does not meet inclusion criteria	14
Final N	67

Data Merging Strategies

Merge by Substring Matching

merge_by_substring() matches records when exact identifiers don’t align (e.g., mapping free-text diagnoses to ICD codes):

medical_terms <- data.frame(
  match_term = c(
    "Type 2 Diabetes", "Hypertension", "Coronary Artery Disease",
    "Coronary Disease", "Chronic Kidney Disease", "Heart Failure", "Atrial Fibrillation"
  ),
  standard_term = c(
    "Type 2 Diabetes Mellitus", "Hypertension", "Coronary Artery Disease",
    "Coronary Artery Disease", "Chronic Kidney Disease", "Heart Failure", "Atrial Fibrillation"
  ),
  icd_code = c("E11", "I10", "I25", "I25", "N18", "I50", "I48"),
  category = c(
    "Endocrine", "Cardiovascular", "Cardiovascular", "Cardiovascular",
    "Renal", "Cardiovascular", "Cardiovascular"
  )
)

patient_diagnoses <- data.frame(
  patient_id = 1:10,
  diagnosis_text = c(
    "Severe Type 2 Diabetes", "Type 2 Diabetes Mellitus",
    "Type 2 Diabetes with Complications", "Essential Hypertension",
    "Hypertensive disease", "CAD - Coronary Artery Disease",
    "Coronary disease", "CKD Stage 3",
    "Congestive Heart Failure", "Heart failure chronic"
  )
)

merged_substring <- merge_by_substring(
  data = patient_diagnoses,
  key_df = medical_terms,
  search_col = "diagnosis_text",
  key_col = "match_term",
  value_cols = c("standard_term", "icd_code", "category")
)

knitr::kable(medical_terms, caption = "Medical Terms Table")

Medical Terms Table
match_term	standard_term	icd_code	category
Type 2 Diabetes	Type 2 Diabetes Mellitus	E11	Endocrine
Hypertension	Hypertension	I10	Cardiovascular
Coronary Artery Disease	Coronary Artery Disease	I25	Cardiovascular
Coronary Disease	Coronary Artery Disease	I25	Cardiovascular
Chronic Kidney Disease	Chronic Kidney Disease	N18	Renal
Heart Failure	Heart Failure	I50	Cardiovascular
Atrial Fibrillation	Atrial Fibrillation	I48	Cardiovascular

knitr::kable(merged_substring, caption = "Diagnoses with Mapped ICD Codes")

Diagnoses with Mapped ICD Codes
diagnosis_text	patient_id	standard_term	icd_code	category
CAD - Coronary Artery Disease	6	Coronary Artery Disease	I25	Cardiovascular
CKD Stage 3	8	NA	NA	NA
Congestive Heart Failure	9	Heart Failure	I50	Cardiovascular
Coronary disease	7	Coronary Artery Disease	I25	Cardiovascular
Essential Hypertension	4	Hypertension	I10	Cardiovascular
Heart failure chronic	10	Heart Failure	I50	Cardiovascular
Hypertensive disease	5	NA	NA	NA
Severe Type 2 Diabetes	1	Type 2 Diabetes Mellitus	E11	Endocrine
Type 2 Diabetes Mellitus	2	Type 2 Diabetes Mellitus	E11	Endocrine
Type 2 Diabetes with Complications	3	Type 2 Diabetes Mellitus	E11	Endocrine

Merge by Range (Time-Based Matching)

merge_by_range() matches events within specific time windows (e.g., lab results during hospitalization):

patient_visits <- data.frame(
  patient_id = rep(1:3, each = 2),
  visit_id = 1:6,
  visit_start = as.Date(c(
    "2023-01-01", "2023-06-01", "2023-02-01",
    "2023-07-01", "2023-03-01", "2023-08-01"
  )),
  visit_end = as.Date(c(
    "2023-01-10", "2023-06-10", "2023-02-10",
    "2023-07-10", "2023-03-10", "2023-08-10"
  ))
)

lab_results <- data.frame(
  lab_id = 1:6,
  patient_id = c(1, 1, 2, 2, 3, 3),
  test_date = as.Date(c(
    "2023-01-05", "2023-06-05", "2023-02-03",
    "2023-07-08", "2023-03-15", "2023-08-05"
  )),
  test_name = c("Glucose", "HbA1c", "Glucose", "Creatinine", "Glucose", "Lipid panel"),
  result = round(rnorm(6, 100, 15), 1)
)

merged_range <- merge_by_range(
  x = patient_visits, y = lab_results,
  by = "patient_id",
  x_start = "visit_start", x_end = "visit_end",
  y_val = "test_date"
)

knitr::kable(merged_range, caption = "Labs Matched to Visit Windows")

Labs Matched to Visit Windows
patient_id	visit_id	visit_start	visit_end	lab_id	test_date	test_name	result	since_start
1	1	2023-01-01	2023-01-10	1	2023-01-05	Glucose	89.3	4
1	2	2023-06-01	2023-06-10	2	2023-06-05	HbA1c	88.7	4
2	3	2023-02-01	2023-02-10	3	2023-02-03	Glucose	85.9	2
2	4	2023-07-01	2023-07-10	4	2023-07-08	Creatinine	84.2	7
3	6	2023-08-01	2023-08-10	6	2023-08-05	Lipid panel	105.0	4
NA	NA	NA	NA	5	2023-03-15	Glucose	93.4	NA

Data Transformation

Long to Wide Format

to_wide() converts long-format data (one row per measurement) to wide format (one row per patient) to facilitate analysis:

long_labs <- data.frame(
  patient_id = rep(1:5, each = 3),
  visit = rep(c(1, 2), times = c(8, 7)),
  test = rep(c("glucose", "creatinine", "cholesterol"), 5),
  value = round(rnorm(15, 100, 20), 1)
)

wide_labs <- to_wide(
  df = long_labs,
  keys = c("patient_id", "visit"),
  item_col = "test",
  value_col = "value"
)

knitr::kable(head(long_labs), caption = "Laboratory Data - Long Format")

Laboratory Data - Long Format
patient_id	visit	test	value
1	1	glucose	59.7
1	1	creatinine	104.2
1	1	cholesterol	124.7
2	1	glucose	140.8
2	1	creatinine	126.0
2	1	cholesterol	115.1

knitr::kable(head(wide_labs), caption = "Laboratory Data - Wide Format")

Laboratory Data - Wide Format
patient_id	visit	cholesterol	creatinine	glucose
1	1	124.7	104.2	59.7
2	1	115.1	126.0	140.8
3	1	NA	88.0	65.5
3	2	93.0	NA	NA
4	2	74.8	97.9	114.1
5	2	104.7	118.2	133.7

Multi-Choice Data

split_multichoice() splits comma-separated multi-choice columns into binary indicators; combine_multichoice() recombines them:

set.seed(456)
survey_data <- data.frame(
  id = 1:20,
  symptoms = sapply(1:20, function(x) {
    paste(sample(
      c("fever", "cough", "headache", "fatigue"),
      sample(1:4, 1)
    ), collapse = ",")
  }),
  comorbidities = sapply(1:20, function(x) {
    paste(sample(
      c("diabetes", "hypertension"),
      sample(1:2, 1)
    ), collapse = ",")
  })
)

symptoms_split <- split_multichoice(
  survey_data,
  quest_cols = c("symptoms", "comorbidities"),
  split = ",",
  remove_space = FALSE
)

knitr::kable(head(survey_data), caption = "Multi-Choice Data")

Multi-Choice Data
id	symptoms	comorbidities
1	fever	diabetes
2	cough,fever,fatigue	diabetes,hypertension
3	fever,cough,fatigue	hypertension
4	fever,headache	diabetes
5	headache,fatigue	hypertension,diabetes
6	fatigue,cough,headache	diabetes


knitr::kable(head(symptoms_split), caption = "Split Multi-Choice Data")

Split Multi-Choice Data
id	symptoms_fever	symptoms_cough	symptoms_fatigue	symptoms_headache	comorbidities_diabetes	comorbidities_hypertension
1	TRUE	FALSE	FALSE	FALSE	TRUE	FALSE
2	TRUE	TRUE	TRUE	FALSE	TRUE	TRUE
3	TRUE	TRUE	TRUE	FALSE	FALSE	TRUE
4	TRUE	FALSE	FALSE	TRUE	TRUE	FALSE
5	FALSE	FALSE	TRUE	TRUE	TRUE	TRUE
6	FALSE	TRUE	TRUE	TRUE	TRUE	FALSE


combined <- combine_multichoice(
  symptoms_split,
  quest_cols = list(
    respiratory = c("symptoms_cough", "symptoms_fatigue"),
    systemic = c("symptoms_fever", "symptoms_headache")
  ),
  sep = ","
)

knitr::kable(head(combined), caption = "Combined Symptom Groups")

Combined Symptom Groups
id	comorbidities_diabetes	comorbidities_hypertension	respiratory	systemic
1	TRUE	FALSE		fever
2	TRUE	TRUE	cough,fatigue	fever
3	FALSE	TRUE	cough,fatigue	fever
4	TRUE	FALSE		fever,headache
5	TRUE	TRUE	fatigue	headache
6	TRUE	FALSE	cough,fatigue	headache

Utility Functions

String Manipulation

common_prefix() extracts the common prefix from a character vector, could be used when processing hospital exports to identify files from the same patient or batch processing multi-site survey data by site prefix.

file_names <- c("patient_001_lab.csv", "patient_001_visit.csv", "patient_001_demo.csv")
common_prefix(file_names)
#> [1] "patient_001_"

str_match_replace() replaces matched patterns in strings with specified replacements, could be used when standardizing lab test names across hospitals or unifying option labels across translated questionnaires.

test_names <- c("Glucose_Fasting", "Glucose_Random", "Cholesterol_Total", "LDL_Calculated")
standardized <- str_match_replace(
  x = test_names,
  to_match = c("Glucose", "Cholesterol", "LDL"),
  to_replace = c("GLU", "CHOL", "LDL_CHOL")
)

knitr::kable(data.frame(Original = test_names, Standardized = standardized),
  caption = "Test Name Standardization"
)

Test Name Standardization
Original	Standardized
Glucose_Fasting	GLU_Fasting
Glucose_Random	GLU_Random
Cholesterol_Total	CHOL_Total
LDL_Calculated	LDL_CHOL_Calculated

List and Data Frame Operations

add_lists() adds corresponding elements of two lists element-wise, could be used when aggregating disease incidence counts across time periods or summing response frequencies across survey sites.

list1 <- list(diabetes = 10, hypertension = 20, asthma = 5)
list2 <- list(diabetes = 15, hypertension = 25, asthma = 8)
add_lists(list1, list2)
#> $diabetes
#> [1] 25
#> 
#> $hypertension
#> [1] 45
#> 
#> $asthma
#> [1] 13

merge_ordered_vectors() merges multiple ordered vectors preserving unique elements and original ordering, could be used when combining inclusion criteria lists from multi-site studies or merging question orders across questionnaire versions.

sites <- list(
  c("Diabetes", "Hypertension", "Heart Failure", "CKD"),
  c("Hypertension", "COPD", "Diabetes", "Cancer"),
  c("CKD", "Diabetes", "Stroke", "Hypertension")
)
merge_ordered_vectors(sites)
#> [1] "COPD"          "Diabetes"      "Stroke"        "Hypertension" 
#> [5] "Heart Failure" "Cancer"        "CKD"

replace_elements() replaces specified elements in a vector with new values, could be used when correcting data entry errors in clinical data or harmonizing option codes across survey interviewers.

sample_data <- data.frame(
  group = c("Cntrol", "Treetment", "Placebo", "Cntrol", "Treatment"),
  value = 1:5
)
recoded <- replace_elements(
  x = sample_data$group,
  from = c("Cntrol", "Treetment"),
  to = c("Control", "Treatment")
)
knitr::kable(data.frame(Original = sample_data$group, Recoded = recoded),
  caption = "Group Recoding"
)

Group Recoding
Original	Recoded
Cntrol	Control
Treetment	Treatment
Placebo	Placebo
Cntrol	Control
Treatment	Treatment

fill_with_last() fills missing values with the last valid observation (LOCF), could be used when handling merged cells from Excel imports of EMR data or filling respondent demographics across repeated survey measures.

time_series <- c(120, NA, NA, 125, NA, 130, NA, NA)
knitr::kable(data.frame(Original = time_series, Filled = fill_with_last(time_series)),
  caption = "Last Observation Carried Forward"
)

Last Observation Carried Forward
Original	Filled
120	120
NA	120
NA	120
125	125
NA	125
130	130
NA	130
NA	130

Multi-Table Cohort Screening

screen_data_list() filters and joins multiple tables based on clinical criteria, with full audit trails. This is essential for retrospective EHR studies.

Scenario: Building a Diabetes Cohort

# Patient demographics
patient <- data.frame(
  pid = 1:50,
  age = sample(25:75, 50, replace = TRUE),
  gender = sample(c("M", "F"), 50, replace = TRUE)
)

# Admission records
admission <- data.frame(
  pid = c(1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
  vid = 101:115,
  admit_day = c(1, 30, 5, 45, 10, 60, 15, 20, 25, 35, 40, 50, 55, 65, 70)
)

# Diagnosis records
diagnosis <- data.frame(
  pid = c(1, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
  vid = c(101, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 113, 114, 115),
  dx_day = c(1, 30, 5, 45, 10, 15, 20, 25, 35, 40, 50, 55, 65, 70),
  icd = c("E11", "I10", "N18", "E11", "E11", "I25", "E11", "J18", "E11", "I10", "E11", "N18", "E11", "I50")
)

# Lab results
lab <- data.frame(
  pid = c(1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
  vid = c(101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115),
  lab_day = c(1, 30, 5, 45, 10, 60, 15, 20, 25, 35, 40, 50, 55, 65, 70),
  HbA1c = c(8.5, 7.2, 9.1, 7.8, 8.0, 6.9, 7.5, 8.2, 7.0, 9.5, 7.8, 8.1, 7.3, 8.8, 7.6)
)

Screen and Filter

Select patients with diabetes (ICD: E11), keeping records from first diabetes diagnosis onward:

screened_data <- screen_data_list(
  data_list = list(
    patient = patient, admission = admission,
    diagnosis = diagnosis, lab = lab
  ),
  entry_expr = any(icd == "E11"),
  entry_level = "patient_id",
  anchor_expr = icd == "E11",
  anchor_level = "date",
  patient_id_map = "pid",
  visit_id_map = c(admission = "vid", diagnosis = "vid", lab = "vid"),
  date_map = c(admission = "admit_day", diagnosis = "dx_day", lab = "lab_day"),
  output = "list",
  return_audit = TRUE
)

knitr::kable(screened_data$audit$entry_scope, caption = "Entry Stage Audit")

Entry Stage Audit
step	table	before	after	removed
entry_scope	patient	50	7	43
entry_scope	admission	15	10	5
entry_scope	diagnosis	14	9	5
entry_scope	lab	15	10	5

knitr::kable(screened_data$audit$anchor_scope, caption = "Anchor Stage Audit")

Anchor Stage Audit
step	table	before	after	removed
anchor_scope	patient	7	7	0
anchor_scope	admission	10	9	1
anchor_scope	diagnosis	9	8	1
anchor_scope	lab	10	9	1

Review Filtered Data

knitr::kable(head(screened_data$data$patient), caption = "Filtered Patients")

Filtered Patients
	pid	age	gender
1	1	51	M
2	2	72	M
3	3	38	M
5	5	75	F
7	7	59	F
9	9	28	M

knitr::kable(screened_data$data$diagnosis, caption = "Filtered Diagnoses")

Filtered Diagnoses
	pid	vid	dx_day	icd
1	1	101	1	E11
2	1	102	30	I10
4	2	104	45	E11
5	3	105	10	E11
7	5	108	20	E11
9	7	110	35	E11
11	9	112	50	E11
13	11	114	65	E11

knitr::kable(screened_data$data$lab, caption = "Filtered Labs")

Filtered Labs
	pid	vid	lab_day	HbA1c
1	1	101	1	8.5
2	1	102	30	7.2
4	2	104	45	7.8
5	3	105	10	8.0
6	3	106	60	6.9
8	5	108	20	8.2
10	7	110	35	9.5
12	9	112	50	8.1
14	11	114	65	8.8

Joined Output

Alternatively, get a single joined data frame:

joined_result <- screen_data_list(
  data_list = list(
    patient = patient, admission = admission,
    diagnosis = diagnosis, lab = lab
  ),
  entry_expr = any(icd == "E11"),
  entry_level = "patient_id",
  anchor_expr = any(icd == "E11"),
  anchor_level = "date",
  anchor_window = "from_first_anchor",
  patient_id_map = "pid",
  visit_id_map = c(admission = "vid", diagnosis = "vid", lab = "vid"),
  date_map = c(admission = "admit_day", diagnosis = "dx_day", lab = "lab_day"),
  output = "joined"
)

cat("Joined data:", nrow(joined_result), "rows,", ncol(joined_result), "columns\n")
#> Joined data: 9 rows, 7 columns
knitr::kable(head(joined_result), caption = "Joined Output")

Joined Output
patient_id	age	gender	visit_id	date	icd	HbA1c
1	51	M	101	1	E11	8.5
1	51	M	102	30	I10	7.2
2	72	M	104	45	E11	7.8
3	38	M	105	10	E11	8.0
3	38	M	106	60	NA	6.9
5	75	F	108	20	E11	8.2

Summary

Function	Clinical Application
`exclusion_count()`	CONSORT-style cohort flow documentation
`screen_data_list()`	Multi-table cohort screening with audit trails
`merge_by_substring()`	Substring matching for diagnosis/procedure coding
`merge_by_range()`	Time-window matching for longitudinal data
`to_wide()`	Long-to-wide format conversion
`split_multichoice()` / `combine_multichoice()`	Survey and symptom data processing
Utility functions	String standardization, list operations, NA filling

	pid	vid	dx_day	icd
1	1	101	1	E11
2	1	102	30	I10
4	2	104	45	E11
5	3	105	10	E11
7	5	108	20	E11
9	7	110	35	E11
11	9	112	50	E11
13	11	114	65	E11

	pid	vid	lab_day	HbA1c
1	1	101	1	8.5
2	1	102	30	7.2
4	2	104	45	7.8
5	3	105	10	8.0
6	3	106	60	6.9
8	5	108	20	8.2
10	7	110	35	9.5
12	9	112	50	8.1
14	11	114	65	8.8

patient_id	age	gender	visit_id	date	icd	HbA1c
1	51	M	101	1	E11	8.5
1	51	M	102	30	I10	7.2
2	72	M	104	45	E11	7.8
3	38	M	105	10	E11	8.0
3	38	M	106	60	NA	6.9
5	75	F	108	20	E11	8.2

	pid	vid	dx_day	icd
1	1	101	1	E11
2	1	102	30	I10
4	2	104	45	E11
5	3	105	10	E11
7	5	108	20	E11
9	7	110	35	E11
11	9	112	50	E11
13	11	114	65	E11

	pid	vid	lab_day	HbA1c
1	1	101	1	8.5
2	1	102	30	7.2
4	2	104	45	7.8
5	3	105	10	8.0
6	3	106	60	6.9
8	5	108	20	8.2
10	7	110	35	9.5
12	9	112	50	8.1
14	11	114	65	8.8

patient_id	age	gender	visit_id	date	icd	HbA1c
1	51	M	101	1	E11	8.5
1	51	M	102	30	I10	7.2
2	72	M	104	45	E11	7.8
3	38	M	105	10	E11	8.0
3	38	M	106	60	NA	6.9
5	75	F	108	20	E11	8.2

	pid	vid	dx_day	icd
1	1	101	1	E11
2	1	102	30	I10
4	2	104	45	E11
5	3	105	10	E11
7	5	108	20	E11
9	7	110	35	E11
11	9	112	50	E11
13	11	114	65	E11

	pid	vid	lab_day	HbA1c
1	1	101	1	8.5
2	1	102	30	7.2
4	2	104	45	7.8
5	3	105	10	8.0
6	3	106	60	6.9
8	5	108	20	8.2
10	7	110	35	9.5
12	9	112	50	8.1
14	11	114	65	8.8

patient_id	age	gender	visit_id	date	icd	HbA1c
1	51	M	101	1	E11	8.5
1	51	M	102	30	I10	7.2
2	72	M	104	45	E11	7.8
3	38	M	105	10	E11	8.0
3	38	M	106	60	NA	6.9
5	75	F	108	20	E11	8.2