clinpubr
is an R package designed to streamline the
workflow from clinical data processing to publication-ready outputs. It
provides tools for clinical data cleaning, significant result screening,
and generating tables/figures suitable for medical journals.
You can install clinpubr
from CRAN with:
install.packages("clinpubr")
Some functions require additional packages for full functionality. The package will automatically prompt you to install missing packages when needed. If you want to install the package with all dependencies, you can use:
install.packages("clinpubr", dependencies = TRUE)
library(clinpubr)
# Sample messy data
messy_data <- data.frame(values = c("12.3", "0..45", " 67 ", "", "abandon"))
clean_data <- value_initial_cleaning(messy_data$values)
print(clean_data)
#> [1] "12.3" "0.45" "67" NA "abandon"
# Sample messy data
x <- c("1.2(XXX)", "1.5", "0.82", "5-8POS", "NS", "FULL")
print(check_nonnum(x))
#> [1] "1.2(XXX)" "5-8POS" "NS" "FULL"
This function picks out the non-numeric values, which helps you choose an appropriate method to handle them.
# Sample messy data
x <- c("1.2(XXX)", "1.5", "0.82", "5-8POS", "NS", "FULL")
print(extract_num(x))
#> [1] 1.20 1.50 0.82 5.00 NA NA
print(extract_num(x,
  res_type = "first", # Extract the first number
  multimatch2na = TRUE, # Convert illegal multiple matches to NA
  zero_regexp = "NEG|NS", # Convert "NEG" and "NS" (matched using regex) to 0
  max_regexp = "FULL", # Convert "FULL" (matched using regex) to some specified quantile
  max_quantile = 0.95
))
#> [1] 1.20 1.50 0.82 NA 0.00 1.47
- to_date(): Converts text to dates and can handle mixed formats.
- unit_view() and unit_standardize(): Provide a pipeline to standardize conflicting units.
- cut_by(): Splits numerics into factors, offering a variety of splitting options and automatic labeling.

data(cancer, package = "survival")
# Screening for potential findings with regression models in the cancer dataset
scan_result <- regression_scan(cancer, y = "status", time = "time", save_table = FALSE)
#> Taking all variables as predictors
knitr::kable(scan_result)
| | predictor | nvalid | original.HR | original.pval | original.padj | logarithm.HR | logarithm.pval | logarithm.padj | categorized.HR | categorized.pval | categorized.padj | rcs.overall.pval | rcs.overall.padj | rcs.nonlinear.pval | rcs.nonlinear.padj | best.var.trans |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | ph.ecog | 227 | 1.6095320 | 0.0000269 | 0.0002154 | NA | NA | NA | NA | 0.0001530 | 0.0012237 | NA | NA | NA | NA | original |
6 | pat.karno | 225 | 0.9803456 | 0.0002824 | 0.0011296 | 0.2709544 | 0.0003071 | 0.0015356 | 0.5755627 | 0.0006608 | 0.0026431 | 0.0025848 | 0.0155086 | 0.5908952 | 0.8863427 | original |
3 | sex | 228 | 0.5880028 | 0.0014912 | 0.0039766 | NA | NA | NA | 0.5880028 | 0.0014912 | 0.0039766 | NA | NA | NA | NA | categorized |
5 | ph.karno | 227 | 0.9836863 | 0.0049579 | 0.0099157 | 0.3184168 | 0.0079468 | 0.0198669 | 0.6352465 | 0.0077670 | 0.0155339 | 0.0128462 | 0.0385385 | 0.2307961 | 0.6848245 | original |
2 | age | 228 | 1.0188965 | 0.0418531 | 0.0669650 | 3.0256773 | 0.0466926 | 0.0778209 | 1.1440790 | 0.3910647 | 0.3957558 | 0.0825447 | 0.1650894 | 0.3424123 | 0.6848245 | original |
1 | inst | 227 | 0.9903692 | 0.3459838 | 0.4613117 | 0.9292046 | 0.3181432 | 0.3976790 | 0.8384047 | 0.2600040 | 0.3466720 | 0.8175277 | 0.8707131 | 0.9839705 | 0.9839705 | categorized |
7 | meal.cal | 181 | 0.9998762 | 0.5929402 | 0.6776459 | 0.9141580 | 0.6128095 | 0.6128095 | 0.8620604 | 0.3957558 | 0.3957558 | 0.8707131 | 0.8707131 | 0.8227256 | 0.9839705 | categorized |
8 | wt.loss | 214 | 1.0013201 | 0.8281974 | 0.8281974 | NA | NA | NA | 1.3190185 | 0.0909098 | 0.1454557 | 0.1128907 | 0.1693361 | 0.0514936 | 0.3089618 | rcs.nonlinear |
var_types <- get_var_types(mtcars, strata = "vs") # Automatically infer variable types
print(var_types)
#> $factor_vars
#> [1] "cyl" "vs" "am" "gear"
#>
#> $exact_vars
#> [1] "cyl" "gear"
#>
#> $nonnormal_vars
#> [1] "drat" "carb"
#>
#> $omit_vars
#> NULL
#>
#> $strata
#> [1] "vs"
#>
#> attr(,"class")
#> [1] "var_types"
tables <- baseline_table(mtcars,
  var_types = var_types, contDigits = 1, save_table = FALSE,
  filename = "baseline.csv", seed = 1 # set seed for simulated fisher exact test
)
knitr::kable(tables$baseline) # Display the table
| | Overall | vs: 0 | vs: 1 | p | test |
---|---|---|---|---|---|
n | 32 | 18 | 14 | ||
mpg (mean (SD)) | 20.1 (6.0) | 16.6 (3.9) | 24.6 (5.4) | <0.001 | |
cyl (%) | | | | <0.001 | exact |
4 | 11 (34.4) | 1 (5.6) | 10 (71.4) | ||
6 | 7 (21.9) | 3 (16.7) | 4 (28.6) | ||
8 | 14 (43.8) | 14 (77.8) | 0 (0.0) | ||
disp (mean (SD)) | 230.7 (123.9) | 307.1 (106.8) | 132.5 (56.9) | <0.001 | |
hp (mean (SD)) | 146.7 (68.6) | 189.7 (60.3) | 91.4 (24.4) | <0.001 | |
drat (median [IQR]) | 3.7 [3.1, 3.9] | 3.2 [3.1, 3.7] | 3.9 [3.7, 4.1] | 0.013 | nonnorm |
wt (mean (SD)) | 3.2 (1.0) | 3.7 (0.9) | 2.6 (0.7) | 0.001 | |
qsec (mean (SD)) | 17.8 (1.8) | 16.7 (1.1) | 19.3 (1.4) | <0.001 | |
am = 1 (%) | 13 (40.6) | 6 (33.3) | 7 (50.0) | 0.556 | |
gear (%) | | | | 0.003 | exact |
3 | 15 (46.9) | 12 (66.7) | 3 (21.4) | ||
4 | 12 (37.5) | 2 (11.1) | 10 (71.4) | ||
5 | 5 (15.6) | 4 (22.2) | 1 (7.1) | ||
carb (median [IQR]) | 2.0 [2.0, 4.0] | 4.0 [2.2, 4.0] | 1.5 [1.0, 2.0] | <0.001 | nonnorm |
data(cancer, package = "survival")
# Performing cox regression, which is inferred by `y` and `time`
p <- rcs_plot(cancer, x = "age", y = "status", time = "time", covars = c("sex", "ph.karno"), save_plot = FALSE)
#> Warning in rcs_plot(cancer, x = "age", y = "status", time = "time", covars =
#> c("sex", : 1 incomplete cases excluded.
plot(p)
data(cancer, package = "survival")
# Generating interaction plot of both linear and RCS models
p <- interaction_plot(cancer,
  y = "status", time = "time", predictor = "age",
  group_var = "sex", save_plot = FALSE
)
plot(p$lin)
plot(p$rcs)
data(cancer, package = "survival")
cancer$dead <- cancer$status == 2 # Preparing a binary variable for logistic regression
cancer$`age per 1 sd` <- c(scale(cancer$age)) # Standardizing age
# Performing multivariate logistic regression
p1 <- regression_forest(cancer,
  model_vars = c("age per 1 sd", "sex", "wt.loss"), y = "dead",
  as_univariate = FALSE, save_plot = FALSE
)
plot(p1)
p2 <- regression_forest(
  cancer,
  model_vars = list(
    Crude = c("age per 1 sd"),
    Model1 = c("age per 1 sd", "sex"),
    Model2 = c("age per 1 sd", "sex", "wt.loss")
  ),
  y = "dead",
  save_plot = FALSE
)
plot(p2)
data(cancer, package = "survival")
# coxph model with time assigned
p <- subgroup_forest(cancer,
  subgroup_vars = c("age", "sex", "wt.loss"), x = "ph.ecog", y = "status",
  time = "time", covars = "ph.karno", ticks_at = c(1, 2), save_plot = FALSE
)
plot(p)
# Building models with example data
data(cancer, package = "survival")
df <- kidney
df$dead <- ifelse(df$time <= 100 & df$status == 0, NA, df$time <= 100)
df <- na.omit(df[, -c(1:3)])

model0 <- glm(dead ~ age + frail, family = binomial(), data = df)
model1 <- glm(dead ~ ., family = binomial(), data = df)
df$base_pred <- predict(model0, type = "response")
df$full_pred <- predict(model1, type = "response")
# Generating most of the useful plots and metrics for model comparison
results <- classif_model_compare(df, "dead", c("base_pred", "full_pred"), save_output = FALSE)
#> Assuming 'TRUE' is [Event] and 'FALSE' is [non-Event]
knitr::kable(results$metric_table)
| | Model | AUC | Accuracy | Sensitivity | Specificity | Pos Pred Value | Neg Pred Value | F1 | Kappa | Brier | cutoff | Youden | HosLem |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | full_pred | 0.915 (0.847, 0.984) | 0.839 | 0.8 | 0.889 | 0.903 | 0.774 | 0.848 | 0.677 | 0.114 | 0.626 | 0.689 | 0.944 |
1 | base_pred | 0.822 (0.711, 0.933) | 0.806 | 0.8 | 0.815 | 0.848 | 0.759 | 0.824 | 0.610 | 0.171 | 0.490 | 0.615 | 0.405 |
plot(results$roc_plot)
plot(results$calibration_plot)
plot(results$dca_plot)
# Generating a dummy importance vector
set.seed(5)
dummy_importance <- runif(20, 0.2, 0.6)^5
names(dummy_importance) <- paste0("var", 1:20)
# Plotting variable importance, keeping only top 15 and splitting at 10
p <- importance_plot(dummy_importance, top_n = 15, split_at = 10, save_plot = FALSE)
plot(p)
#> Warning: Removed 1 row containing missing values or values outside the scale range
#> (`geom_bar()`).
For detailed usage, refer to the package vignettes (coming soon) or the GitHub repository.
Bug reports and feature requests are welcome via the issue tracker.
clinpubr
is licensed under GPL (>= 3).