Introduction to LISAT

Introduction

The lisat (Longitudinal Integration Site Analysis Toolkit) package provides a comprehensive set of tools for the analysis of longitudinal integration site data. This vignette demonstrates the basic workflow, including data simulation, annotation, statistical modeling, and visualization.

Setup

First, load the lisat package.

library(lisat)

C:7bHOv5c3865012065-intro.R

Step 1: Data Preparation

We will generate simulated chromosome integration site (IS) raw data to demonstrate the functionality of the package.

set.seed(12345)

n_rows <- 10000
sample_names <- c("Sample_A", "Sample_B", "Sample_C")
chr_list <- paste0(1:23)

# Generate random data
Sample <- sample(sample_names, size = n_rows, replace = TRUE)
SCount <- sample(1:1000, size = n_rows, replace = TRUE)
Chr <- sample(chr_list, size = n_rows, replace = TRUE)
Locus <- sample(1:150000000, size = n_rows, replace = TRUE)

IS_raw <- data.frame(
  Sample = Sample,
  SCount = SCount,
  Chr = Chr,
  Locus = Locus,
  stringsAsFactors = FALSE
)

# Simulate some high-frequency clones (updated)
IS_raw$SCount[1:100] <- sample(500000:800000, 100, replace = TRUE)

head(IS_raw)
#>     Sample SCount Chr     Locus
#> 1 Sample_B 518557   1 113660784
#> 2 Sample_C 538402   2  74275020
#> 3 Sample_B 686867  23 149373139
#> 4 Sample_B 750400  21  34406576
#> 5 Sample_A 525756   2  39308499
#> 6 Sample_C 695199  21 129455667

C:7bHOv5c3865012065-intro.R

Validate the data structure:

check_validity <- validate_IS_raw(IS_raw)

C:7bHOv5c3865012065-intro.R

Create a patient-timepoint mapping table for longitudinal analysis.

Patient_timepoint <- data.frame(
  Sample_ID = c("Sample_A", "Sample_B", "Sample_C"),
  Time_Point = c("3m", "12m", "24m"),
  Patient_ID = rep("Pt1", 3),
  stringsAsFactors = FALSE
)

head(Patient_timepoint)
#>   Sample_ID Time_Point Patient_ID
#> 1  Sample_A         3m        Pt1
#> 2  Sample_B        12m        Pt1
#> 3  Sample_C        24m        Pt1

C:7bHOv5c3865012065-intro.R

Step 2: Genomic Feature Annotation

Annotate the integration sites with genomic features. Note: This step requires TxDb.Hsapiens.UCSC.hg38.knownGene and org.Hs.eg.db packages.

if (requireNamespace("TxDb.Hsapiens.UCSC.hg38.knownGene", quietly = TRUE) &&
    requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
    
  IS_raw <- get_feature(IS_raw)
  
  # Check for overlap with specific genomic elements
  IS_raw <- Enhancer_check(IS_raw)
  IS_raw <- Promotor_check(IS_raw)
  IS_raw <- Safeharbor_check(IS_raw)
  
  names(IS_raw)
} else {
  message("Skipping annotation: Required annotation packages not installed.")
}
#>  [1] "Sample"             "SCount"             "Chr"               
#>  [4] "Locus"              "Clone_contribution" "in_gene"           
#>  [7] "in_exon"            "in_intron"          "nearest_distance"  
#> [10] "start"              "end"                "width"             
#> [13] "strand"             "nearest_gene_name"  "Enhancer"          
#> [16] "Promotor"           "Safeharbor"

C:7bHOv5c3865012065-intro.R

Step 3: Integration Site Analysis

Common Integration Sites (CIS)

Identify regions with recurrent integration sites.

CIS_top <- CIS(IS_raw = IS_raw, connect_distance = 50000)
#> CIS_network calculates interactions between integration sites in a network view
CIS_by_sample <- CIS_overlap(CIS_data = CIS_top, IS_raw = IS_raw)
CIS_by_sample
Chr Locus Gene Total_dots Central_connectivity Dimention Order Gene_network Sample_A Sample_B Sample_C
chr14 46622339 RPL10L 5 3 23715 10 RPL10L FALSE TRUE TRUE
chr6 119446557 LOC105377975 4 3 29464 7 LOC105377975 TRUE TRUE FALSE
chr7 49323212 LOC124901804 4 3 30996 7 LOC124901804 TRUE TRUE TRUE
chr20 38686365 SLC32A1 4 3 33175 7 SLC32A1; ARHGAP40 FALSE TRUE TRUE
chr11 49305065 FOLH1 4 2 32606 6 FOLH1 TRUE TRUE TRUE
chr17 96254683 ZNF750 4 2 32661 6 ZNF750 FALSE TRUE TRUE
chr12 136321446 ANHX 4 1 51276 5 ANHX FALSE TRUE TRUE
chr8 4382729 CSMD1 4 2 34476 5 CSMD1 TRUE FALSE TRUE
chr4 102665076 MANBA 4 2 41393 5 MANBA TRUE TRUE TRUE
chr19 117947388 CENPBD2P 3 2 2379 4 CENPBD2P TRUE TRUE TRUE

C:7bHOv5c3865012065-intro.R

Chromosome Distribution

chr_stats <- chr_distribution(IS_raw)
print(chr_stats)

C:7bHOv5c3865012065-intro.R

Gene Set Overlap

if (requireNamespace("TxDb.Hsapiens.UCSC.hg38.knownGene", quietly = TRUE) &&
    requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
  # Adverse Event genes
  ae_overlap <- is_in_AE_gene(IS_raw = IS_raw, Distance = 100000)
  
  # Cancer Genes
  cg_overlap <- is_in_CG_gene(IS_raw = IS_raw, threashold = 0.001)
  print(cg_overlap)
  
  # Immune Genes
  immune_overlap <- is_in_immune_gene(IS_raw = IS_raw, threashold = 0.001)
}

C:7bHOv5c3865012065-intro.R

Step 4: Longitudinal Analysis (PMD)

Perform Population Matching Distribution (PMD) analysis.

PMD_data <- pmd_analysis(IS_raw = IS_raw, Patient_timepoint = Patient_timepoint)
head(PMD_data)
#>           UIS    TOP_P Richness  Eveness        PMD   Sample Time
#> Sample_A 3319 4.126260 11.69653 4.599022 -0.6259863 Sample_A   3m
#> Sample_B 3334 3.217942 11.70304 4.957718 -0.4442108 Sample_B  12m
#> Sample_C 3347 3.079673 11.70865 5.021079 -0.4134854 Sample_C  24m
# Plot Richness and Evenness
plot_richness_evenness(PMD_data = PMD_data)


# Analyze linked timepoints
linked_data <- Linked_timepoints(IS_raw = IS_raw, Patient_timepoint = Patient_timepoint)
print(linked_data)

C:7bHOv5c3865012065-intro.R

Step 5: Clonal Dominance Analysis

Analyze potential dominant clones using cumulative distribution.

IS_ratio <- fit_cum_simple(IS_raw$SCount)
print(Cumulative_curve(IS_ratio)) # Function for plotting
#> [[1]]

#> 
#> [[2]]
#> # A tibble: 8 × 8
#>   estimate statistic  p.value parameter conf.low conf.high method    alternative
#>      <dbl>     <dbl>    <dbl>     <dbl>    <dbl>     <dbl> <chr>     <chr>      
#> 1  0.00260    -945.  7.42e-46        19  0.00212   0.00308 One Samp… two.sided  
#> 2  0.0183     -255.  4.93e-35        19  0.0149    0.0216  One Samp… two.sided  
#> 3  0.0319     -239.  1.65e-34        19  0.0266    0.0371  One Samp… two.sided  
#> 4  0.0549     -255.  4.85e-35        19  0.0475    0.0622  One Samp… two.sided  
#> 5  0.0954     -171.  9.20e-32        19  0.0843    0.106   One Samp… two.sided  
#> 6  0.166      -101.  1.92e-27        19  0.149     0.184   One Samp… two.sided  
#> 7  0.286       -58.9 5.68e-23        19  0.261     0.312   One Samp… two.sided  
#> 8  0.478       -38.1 2.12e-19        19  0.449     0.507   One Samp… two.sided  
#> 
#> [[3]]
#> 
#>  Wilcoxon rank sum test with continuity correction
#> 
#> data:  Sampe_l2d and distances
#> W = 3800, p-value = 2.005e-13
#> alternative hypothesis: true location shift is not equal to 0

C:7bHOv5c3865012065-intro.R

Step 6: Visualization

Treemap

if (requireNamespace("treemapify", quietly = TRUE)) {
    IS_treemap(IS_raw = IS_raw, Patient_timepoint = Patient_timepoint)
}

C:7bHOv5c3865012065-intro.R

Region Counts

Region_data <- Count_regions(IS_raw = IS_raw, Patient_timepoint = Patient_timepoint)
head(Region_data)
#> $Sample_A
#>      Product        Share Percentage      Label Time
#> 1     Exonic 0.0334438084        3.3     Exonic   3m
#> 2   Intronic 0.3711961434       37.1   Intronic   3m
#> 3  Integenic 0.5953600482       59.5  Integenic   3m
#> 4   Enhancer 0.0006025911        0.1   Enhancer   3m
#> 5   Promotor 0.0000000000        0.0   Promotor   3m
#> 6 SafeHarbor 0.0370593552        3.7 SafeHarbor   3m
#> 
#> $Sample_B
#>      Product      Share Percentage      Label Time
#> 1     Exonic 0.03359328        3.4     Exonic  12m
#> 2   Intronic 0.36142771       36.1   Intronic  12m
#> 3  Integenic 0.60497900       60.5  Integenic  12m
#> 4   Enhancer 0.00269946        0.3   Enhancer  12m
#> 5   Promotor 0.00000000        0.0   Promotor  12m
#> 6 SafeHarbor 0.03839232        3.8 SafeHarbor  12m
#> 
#> $Sample_C
#>      Product       Share Percentage      Label Time
#> 1     Exonic 0.031968927        3.2     Exonic  24m
#> 2   Intronic 0.387212429       38.7   Intronic  24m
#> 3  Integenic 0.580818644       58.1  Integenic  24m
#> 4   Enhancer 0.002091425        0.2   Enhancer  24m
#> 5   Promotor 0.000000000        0.0   Promotor  24m
#> 6 SafeHarbor 0.037048103        3.7 SafeHarbor  24m
plot_regions(Region_data = Region_data)

C:7bHOv5c3865012065-intro.R

Ideogram

# Example usage:
# ideogram_plot(IS_raw, output_dir = tempdir())

C:7bHOv5c3865012065-intro.R

mirror server hosted at Truenetwork, Russian Federation.