The vald.extractor package provides a robust,
production-ready pipeline for extracting, cleaning, and analyzing VALD
ForceDecks data across multiple sports. This vignette demonstrates the
complete workflow from API authentication to publication-ready
visualizations.
First, set your VALD API credentials and extract test/trial data:
library(vald.extractor)
# Set credentials
valdr::set_credentials(
client_id = "your_client_id",
client_secret = "your_client_secret",
tenant_id = "your_tenant_id",
region = "aue"
)
# Fetch data from 2020 onwards in chunks of 100 tests
vald_data <- fetch_vald_batch(
start_date = "2020-01-01T00:00:00Z",
chunk_size = 100,
verbose = TRUE
)
# Extract components
tests_df <- vald_data$tests
trials_df <- vald_data$trials
cat("Extracted", nrow(tests_df), "tests and", nrow(trials_df), "trials\n")Why chunking matters: Without chunking, large organizations with 5000+ tests will experience API timeout errors. The chunked approach processes 100 tests at a time, with fault-tolerant error handling that logs issues without halting the entire extraction.
Retrieve athlete profiles and group memberships via OAuth2:
# Fetch raw metadata
metadata <- fetch_vald_metadata(
client_id = "your_client_id",
client_secret = "your_client_secret",
tenant_id = "your_tenant_id",
region = "aue"
)
# Standardize: unnest group memberships and create unified athlete records
athlete_metadata <- standardize_vald_metadata(
profiles = metadata$profiles,
groups = metadata$groups
)
head(athlete_metadata)

The VALD API stores group memberships as a nested array (groupIds). The standardize_vald_metadata() function unnests this array, joins each ID to its group name, and collapses the memberships into a single record per athlete.

Result: a clean metadata table where all_group_names contains "Football, U18, Elite" for an athlete in multiple groups.
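Conceptually, the unnest-and-collapse step looks something like the sketch below. It is a minimal illustration, assuming raw columns named profileId, groupIds, id, and name; the package's internal implementation may differ.

library(dplyr)
library(tidyr)

# Minimal sketch of the unnest-and-collapse idea (column names are assumptions
# about the raw payload, not the package internals)
flatten_group_names <- function(profiles, groups) {
  profiles %>%
    unnest_longer(groupIds) %>%                       # one row per group membership
    left_join(groups, by = c("groupIds" = "id")) %>%  # attach the group name
    group_by(profileId) %>%
    summarise(all_group_names = paste(unique(name), collapse = ", "),
              .groups = "drop")
}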
Map inconsistent team names to standardized sports categories:
athlete_metadata <- classify_sports(
data = athlete_metadata,
group_col = "all_group_names",
output_col = "sports_clean"
)
# Inspect the mapping
table(athlete_metadata$sports_clean)

The value add: this regex-based classification is the core innovation. Organizations often record the same sport under many inconsistent group names. Without this automation, analysts spend hours manually categorizing athletes. The package includes patterns for 15+ sports and can be easily extended.
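The idea can be sketched with case_when() and grepl(); the patterns below are illustrative examples, not the package's built-in mapping.

library(dplyr)

# Illustrative patterns only -- the real mapping ships with the package
classify_sports_sketch <- function(data, group_col = "all_group_names") {
  data %>%
    mutate(sports_clean = case_when(
      grepl("football|soccer", .data[[group_col]], ignore.case = TRUE) ~ "Football",
      grepl("basketball",      .data[[group_col]], ignore.case = TRUE) ~ "Basketball",
      grepl("track|athletic",  .data[[group_col]], ignore.case = TRUE) ~ "Track & Field",
      TRUE                                                             ~ "Other"
    ))
}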
Combine trials into tests, pivot to wide format, and merge with metadata:
library(dplyr)
# Join trials and tests
all_data <- left_join(trials_df, tests_df, by = c("testId", "athleteId"))
# Aggregate trials and pivot to wide format
structured_test_data <- all_data %>%
group_by(athleteId, testId, testType, recordedUTC,
recordedDateOffset, trialLimb, definition_name) %>%
summarise(
mean_result = mean(as.numeric(value), na.rm = TRUE),
mean_weight = mean(as.numeric(weight), na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
TestTimestampUTC = lubridate::ymd_hms(recordedUTC),
TestTimestampLocal = TestTimestampUTC + lubridate::minutes(recordedDateOffset),
Testdate = as.Date(TestTimestampLocal)
) %>%
select(athleteId, Testdate, testId, testType, trialLimb,
definition_name, mean_result, mean_weight) %>%
tidyr::pivot_wider(
id_cols = c(athleteId, Testdate, testId, mean_weight),
names_from = c(definition_name, trialLimb, testType),
values_from = mean_result,
names_glue = "{definition_name}_{trialLimb}_{testType}"
) %>%
rename(Weight_on_Test_Day = mean_weight)
# Join with metadata
final_analysis_data <- structured_test_data %>%
mutate(profileId = as.character(athleteId)) %>%
left_join(
athlete_metadata %>% mutate(profileId = as.character(profileId)),
by = "profileId"
) %>%
mutate(
Testdate = as.Date(Testdate),
dateofbirth = as.Date(dateOfBirth),
age = as.numeric((Testdate - dateofbirth) / 365.25),
sports = sports_clean
)
cat("Final dataset:", nrow(final_analysis_data), "rows with",
ncol(final_analysis_data), "columns\n")The “Don’t Repeat Yourself” (DRY) principle in action:
# Split into separate datasets per test type
test_datasets <- split_by_test(
data = final_analysis_data,
metadata_cols = c("profileId", "sex", "Testdate", "dateofbirth",
"age", "testId", "Weight_on_Test_Day", "sports")
)
# Access individual test types
cmj_data <- test_datasets$CMJ
dj_data <- test_datasets$DJ
# Crucially: column names are now generic
head(names(cmj_data))
# "profileId", "sex", "Testdate", "PEAK_FORCE_Both", "JUMP_HEIGHT_Both", ...
# Note: "_CMJ" suffix has been removed!Why this matters: You can now write one analysis function that works for all test types:
analyze_peak_force <- function(test_data) {
summary(test_data$PEAK_FORCE_Both) # Works for CMJ, DJ, ISO, etc.
}
# Apply to all test types
lapply(test_datasets, analyze_peak_force)

Without suffix removal, you would need separate code for PEAK_FORCE_Both_CMJ, PEAK_FORCE_Both_DJ, and so on.
Fix missing or incorrect demographic data:
# Create an Excel file with: profileId, sex, dateOfBirth
# Example: corrections.xlsx with rows like:
# profileId sex dateOfBirth
# abc123 Male 1995-03-15
# def456 Female 1998-07-22
cmj_data <- patch_metadata(
data = cmj_data,
patch_file = "corrections.xlsx",
patch_sheet = 1,
id_col = "profileId",
fields_to_patch = c("sex", "dateOfBirth")
)
# Verify corrections
table(cmj_data$sex) # "Unknown" values should now be fixedCreate publication-ready summary tables:
cmj_summary <- summary_vald_metrics(
data = cmj_data,
group_vars = c("sex", "sports"),
exclude_cols = c("profileId", "testId", "Testdate", "dateofbirth", "age")
)
# View summary
print(cmj_summary)
# Export to CSV
write.csv(cmj_summary, "cmj_summary_by_sport_sex.csv", row.names = FALSE)

Output example:
sex sports PEAK_FORCE_Both_Mean PEAK_FORCE_Both_SD PEAK_FORCE_Both_CV PEAK_FORCE_Both_N
Male Football 2450.32 245.67 10.02 45
Male Basketball 2310.45 198.23 8.58 32
Female Football 1980.12 187.45 9.47 38
Track performance over time:
library(ggplot2)
# Plot CMJ peak force trends by athlete
plot_vald_trends(
data = cmj_data,
date_col = "Testdate",
metric_col = "PEAK_FORCE_Both",
group_col = "profileId",
facet_col = "sex",
title = "CMJ Peak Force Trends by Athlete",
smooth = TRUE
)
# Plot sport-level averages over time
sport_trends <- cmj_data %>%
group_by(Testdate, sports) %>%
summarise(avg_force = mean(PEAK_FORCE_Both, na.rm = TRUE), .groups = "drop")
plot_vald_trends(
data = sport_trends,
date_col = "Testdate",
metric_col = "avg_force",
group_col = "sports",
title = "Average CMJ Peak Force by Sport Over Time"
)

Create boxplots for cross-sectional comparisons:
plot_vald_compare(
data = cmj_data,
metric_col = "PEAK_FORCE_Both",
group_col = "sports",
fill_col = "sex",
title = "CMJ Peak Force Comparison by Sport and Sex"
)
# Compare jump height
plot_vald_compare(
data = cmj_data,
metric_col = "JUMP_HEIGHT_Both",
group_col = "sports",
fill_col = "sex",
title = "CMJ Jump Height Comparison"
)

Analyze multiple test types simultaneously:
# Define a function to extract a common metric across test types
compare_metric_across_tests <- function(test_datasets, metric = "PEAK_FORCE_Both") {
results <- lapply(names(test_datasets), function(test_name) {
test_data <- test_datasets[[test_name]]
if (metric %in% names(test_data)) {
data.frame(
testType = test_name,
metric = metric,
mean = mean(test_data[[metric]], na.rm = TRUE),
sd = sd(test_data[[metric]], na.rm = TRUE),
n = sum(!is.na(test_data[[metric]]))
)
}
})
do.call(rbind, results)
}
# Compare peak force across CMJ, DJ, and ISO
force_comparison <- compare_metric_across_tests(test_datasets, "PEAK_FORCE_Both")
print(force_comparison)

# Weekly refresh script
library(vald.extractor)
# Fetch only new data since last update
last_update <- "2024-01-01T00:00:00Z"
new_data <- fetch_vald_batch(
start_date = last_update,
chunk_size = 100
)
# Append to existing database
load("vald_database.RData")
updated_tests <- rbind(existing_tests, new_data$tests)
updated_trials <- rbind(existing_trials, new_data$trials)
save(updated_tests, updated_trials, file = "vald_database.RData")

The chunked extraction automatically logs errors without halting, so a single failed chunk will not abort a scheduled refresh.
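For scheduled runs you may also want to wrap the whole refresh, so that a failed fetch leaves the saved database untouched. A minimal sketch (not a package feature), reusing the objects from the script above:

refresh_ok <- tryCatch({
  new_data <- fetch_vald_batch(start_date = last_update, chunk_size = 100)
  load("vald_database.RData")
  updated_tests  <- rbind(existing_tests,  new_data$tests)
  updated_trials <- rbind(existing_trials, new_data$trials)
  save(updated_tests, updated_trials, file = "vald_database.RData")
  TRUE
}, error = function(e) {
  message("Weekly refresh failed: ", conditionMessage(e))
  FALSE  # the existing vald_database.RData is left as-is
})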
Store your sports classification rules in a separate config file:
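For example (an illustration, not a package feature), a hypothetical sports_config.csv with pattern and sport columns can hold the regex rules and be applied at run time:

# Hypothetical sports_config.csv:
#   pattern,sport
#   football|soccer,Football
#   basketball,Basketball
rules <- read.csv("sports_config.csv", stringsAsFactors = FALSE)

apply_sport_rules <- function(group_names, rules, default = "Other") {
  out <- rep(default, length(group_names))
  for (i in seq_len(nrow(rules))) {
    hit <- grepl(rules$pattern[i], group_names, ignore.case = TRUE) & out == default
    out[hit] <- rules$sport[i]  # earlier rules take precedence
  }
  out
}

athlete_metadata$sports_clean <- apply_sport_rules(athlete_metadata$all_group_names, rules)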
The vald.extractor package transforms raw VALD API data into analysis-ready datasets: chunked extraction with fault-tolerant error handling, standardized athlete metadata, automated sport classification, per-test datasets with generic column names, and summary and plotting helpers built on ggplot2.

This workflow is production-tested with 10,000+ tests across 15+ sports and is designed for CRAN submission.
For issues or feature requests, visit: GitHub Issues