The lisat (Longitudinal Integration Site Analysis
Toolkit) package provides a comprehensive set of tools for the analysis
of longitudinal integration site data. This vignette demonstrates the
basic workflow, including data simulation, annotation, statistical
modeling, and visualization.
We will generate simulated chromosome integration site (IS) raw data to demonstrate the functionality of the package.
set.seed(12345)
n_rows <- 10000
sample_names <- c("Sample_A", "Sample_B", "Sample_C")
chr_list <- paste0(1:23)
# Generate random data
Sample <- sample(sample_names, size = n_rows, replace = TRUE)
SCount <- sample(1:1000, size = n_rows, replace = TRUE)
Chr <- sample(chr_list, size = n_rows, replace = TRUE)
Locus <- sample(1:150000000, size = n_rows, replace = TRUE)
IS_raw <- data.frame(
Sample = Sample,
SCount = SCount,
Chr = Chr,
Locus = Locus,
stringsAsFactors = FALSE
)
# Simulate some high-frequency clones (updated)
IS_raw$SCount[1:100] <- sample(500000:800000, 100, replace = TRUE)
head(IS_raw)
#> Sample SCount Chr Locus
#> 1 Sample_B 518557 1 113660784
#> 2 Sample_C 538402 2 74275020
#> 3 Sample_B 686867 23 149373139
#> 4 Sample_B 750400 21 34406576
#> 5 Sample_A 525756 2 39308499
#> 6 Sample_C 695199 21 129455667C:7bHOv5c3865012065-intro.R
Validate the data structure:
C:7bHOv5c3865012065-intro.R
Create a patient-timepoint mapping table for longitudinal analysis.
Patient_timepoint <- data.frame(
Sample_ID = c("Sample_A", "Sample_B", "Sample_C"),
Time_Point = c("3m", "12m", "24m"),
Patient_ID = rep("Pt1", 3),
stringsAsFactors = FALSE
)
head(Patient_timepoint)
#> Sample_ID Time_Point Patient_ID
#> 1 Sample_A 3m Pt1
#> 2 Sample_B 12m Pt1
#> 3 Sample_C 24m Pt1C:7bHOv5c3865012065-intro.R
Annotate the integration sites with genomic features. Note: This
step requires TxDb.Hsapiens.UCSC.hg38.knownGene and
org.Hs.eg.db packages.
if (requireNamespace("TxDb.Hsapiens.UCSC.hg38.knownGene", quietly = TRUE) &&
requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
IS_raw <- get_feature(IS_raw)
# Check for overlap with specific genomic elements
IS_raw <- Enhancer_check(IS_raw)
IS_raw <- Promotor_check(IS_raw)
IS_raw <- Safeharbor_check(IS_raw)
names(IS_raw)
} else {
message("Skipping annotation: Required annotation packages not installed.")
}
#> [1] "Sample" "SCount" "Chr"
#> [4] "Locus" "Clone_contribution" "in_gene"
#> [7] "in_exon" "in_intron" "nearest_distance"
#> [10] "start" "end" "width"
#> [13] "strand" "nearest_gene_name" "Enhancer"
#> [16] "Promotor" "Safeharbor"C:7bHOv5c3865012065-intro.R
Identify regions with recurrent integration sites.
CIS_top <- CIS(IS_raw = IS_raw, connect_distance = 50000)
#> CIS_network calculates interactions between integration sites in a network view
CIS_by_sample <- CIS_overlap(CIS_data = CIS_top, IS_raw = IS_raw)
CIS_by_sample| Chr | Locus | Gene | Total_dots | Central_connectivity | Dimention | Order | Gene_network | Sample_A | Sample_B | Sample_C |
|---|---|---|---|---|---|---|---|---|---|---|
| chr14 | 46622339 | RPL10L | 5 | 3 | 23715 | 10 | RPL10L | FALSE | TRUE | TRUE |
| chr6 | 119446557 | LOC105377975 | 4 | 3 | 29464 | 7 | LOC105377975 | TRUE | TRUE | FALSE |
| chr7 | 49323212 | LOC124901804 | 4 | 3 | 30996 | 7 | LOC124901804 | TRUE | TRUE | TRUE |
| chr20 | 38686365 | SLC32A1 | 4 | 3 | 33175 | 7 | SLC32A1; ARHGAP40 | FALSE | TRUE | TRUE |
| chr11 | 49305065 | FOLH1 | 4 | 2 | 32606 | 6 | FOLH1 | TRUE | TRUE | TRUE |
| chr17 | 96254683 | ZNF750 | 4 | 2 | 32661 | 6 | ZNF750 | FALSE | TRUE | TRUE |
| chr12 | 136321446 | ANHX | 4 | 1 | 51276 | 5 | ANHX | FALSE | TRUE | TRUE |
| chr8 | 4382729 | CSMD1 | 4 | 2 | 34476 | 5 | CSMD1 | TRUE | FALSE | TRUE |
| chr4 | 102665076 | MANBA | 4 | 2 | 41393 | 5 | MANBA | TRUE | TRUE | TRUE |
| chr19 | 117947388 | CENPBD2P | 3 | 2 | 2379 | 4 | CENPBD2P | TRUE | TRUE | TRUE |
C:7bHOv5c3865012065-intro.R
C:7bHOv5c3865012065-intro.R
if (requireNamespace("TxDb.Hsapiens.UCSC.hg38.knownGene", quietly = TRUE) &&
requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
# Adverse Event genes
ae_overlap <- is_in_AE_gene(IS_raw = IS_raw, Distance = 100000)
# Cancer Genes
cg_overlap <- is_in_CG_gene(IS_raw = IS_raw, threashold = 0.001)
print(cg_overlap)
# Immune Genes
immune_overlap <- is_in_immune_gene(IS_raw = IS_raw, threashold = 0.001)
}C:7bHOv5c3865012065-intro.R
Perform Population Matching Distribution (PMD) analysis.
PMD_data <- pmd_analysis(IS_raw = IS_raw, Patient_timepoint = Patient_timepoint)
head(PMD_data)
#> UIS TOP_P Richness Eveness PMD Sample Time
#> Sample_A 3319 4.126260 11.69653 4.599022 -0.6259863 Sample_A 3m
#> Sample_B 3334 3.217942 11.70304 4.957718 -0.4442108 Sample_B 12m
#> Sample_C 3347 3.079673 11.70865 5.021079 -0.4134854 Sample_C 24m
# Plot Richness and Evenness
plot_richness_evenness(PMD_data = PMD_data)
# Analyze linked timepoints
linked_data <- Linked_timepoints(IS_raw = IS_raw, Patient_timepoint = Patient_timepoint)
print(linked_data)C:7bHOv5c3865012065-intro.R
Analyze potential dominant clones using cumulative distribution.
IS_ratio <- fit_cum_simple(IS_raw$SCount)
print(Cumulative_curve(IS_ratio)) # Function for plotting
#> [[1]]#>
#> [[2]]
#> # A tibble: 8 × 8
#> estimate statistic p.value parameter conf.low conf.high method alternative
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 0.00260 -945. 7.42e-46 19 0.00212 0.00308 One Samp… two.sided
#> 2 0.0183 -255. 4.93e-35 19 0.0149 0.0216 One Samp… two.sided
#> 3 0.0319 -239. 1.65e-34 19 0.0266 0.0371 One Samp… two.sided
#> 4 0.0549 -255. 4.85e-35 19 0.0475 0.0622 One Samp… two.sided
#> 5 0.0954 -171. 9.20e-32 19 0.0843 0.106 One Samp… two.sided
#> 6 0.166 -101. 1.92e-27 19 0.149 0.184 One Samp… two.sided
#> 7 0.286 -58.9 5.68e-23 19 0.261 0.312 One Samp… two.sided
#> 8 0.478 -38.1 2.12e-19 19 0.449 0.507 One Samp… two.sided
#>
#> [[3]]
#>
#> Wilcoxon rank sum test with continuity correction
#>
#> data: Sampe_l2d and distances
#> W = 3800, p-value = 2.005e-13
#> alternative hypothesis: true location shift is not equal to 0
C:7bHOv5c3865012065-intro.R
if (requireNamespace("treemapify", quietly = TRUE)) {
IS_treemap(IS_raw = IS_raw, Patient_timepoint = Patient_timepoint)
}C:7bHOv5c3865012065-intro.R
Region_data <- Count_regions(IS_raw = IS_raw, Patient_timepoint = Patient_timepoint)
head(Region_data)
#> $Sample_A
#> Product Share Percentage Label Time
#> 1 Exonic 0.0334438084 3.3 Exonic 3m
#> 2 Intronic 0.3711961434 37.1 Intronic 3m
#> 3 Integenic 0.5953600482 59.5 Integenic 3m
#> 4 Enhancer 0.0006025911 0.1 Enhancer 3m
#> 5 Promotor 0.0000000000 0.0 Promotor 3m
#> 6 SafeHarbor 0.0370593552 3.7 SafeHarbor 3m
#>
#> $Sample_B
#> Product Share Percentage Label Time
#> 1 Exonic 0.03359328 3.4 Exonic 12m
#> 2 Intronic 0.36142771 36.1 Intronic 12m
#> 3 Integenic 0.60497900 60.5 Integenic 12m
#> 4 Enhancer 0.00269946 0.3 Enhancer 12m
#> 5 Promotor 0.00000000 0.0 Promotor 12m
#> 6 SafeHarbor 0.03839232 3.8 SafeHarbor 12m
#>
#> $Sample_C
#> Product Share Percentage Label Time
#> 1 Exonic 0.031968927 3.2 Exonic 24m
#> 2 Intronic 0.387212429 38.7 Intronic 24m
#> 3 Integenic 0.580818644 58.1 Integenic 24m
#> 4 Enhancer 0.002091425 0.2 Enhancer 24m
#> 5 Promotor 0.000000000 0.0 Promotor 24m
#> 6 SafeHarbor 0.037048103 3.7 SafeHarbor 24m
plot_regions(Region_data = Region_data)C:7bHOv5c3865012065-intro.R