Introduction to LISAT

Step 1: Data Preparation

We will generate simulated chromosome integration site (IS) raw data to demonstrate the functionality of the package.

set.seed(12345)

n_rows <- 10000
sample_names <- c("Sample_A", "Sample_B", "Sample_C")
chr_list <- paste0(1:23)

# Generate random data
Sample <- sample(sample_names, size = n_rows, replace = TRUE)
SCount <- sample(1:1000, size = n_rows, replace = TRUE)
Chr <- sample(chr_list, size = n_rows, replace = TRUE)
Locus <- sample(1:150000000, size = n_rows, replace = TRUE)

IS_raw <- data.frame(
  Sample = Sample,
  SCount = SCount,
  Chr = Chr,
  Locus = Locus,
  stringsAsFactors = FALSE
)

# Simulate some high-frequency clones (updated)
IS_raw$SCount[1:100] <- sample(500000:800000, 100, replace = TRUE)

head(IS_raw)
#>     Sample SCount Chr     Locus
#> 1 Sample_B 518557   1 113660784
#> 2 Sample_C 538402   2  74275020
#> 3 Sample_B 686867  23 149373139
#> 4 Sample_B 750400  21  34406576
#> 5 Sample_A 525756   2  39308499
#> 6 Sample_C 695199  21 129455667

C:7bHOv5c3865012065-intro.R

Validate the data structure:

check_validity <- validate_IS_raw(IS_raw)

C:7bHOv5c3865012065-intro.R

Create a patient-timepoint mapping table for longitudinal analysis.

Patient_timepoint <- data.frame(
  Sample_ID = c("Sample_A", "Sample_B", "Sample_C"),
  Time_Point = c("3m", "12m", "24m"),
  Patient_ID = rep("Pt1", 3),
  stringsAsFactors = FALSE
)

head(Patient_timepoint)
#>   Sample_ID Time_Point Patient_ID
#> 1  Sample_A         3m        Pt1
#> 2  Sample_B        12m        Pt1
#> 3  Sample_C        24m        Pt1

C:7bHOv5c3865012065-intro.R

Step 2: Genomic Feature Annotation

Annotate the integration sites with genomic features. Note: This step requires TxDb.Hsapiens.UCSC.hg38.knownGene and org.Hs.eg.db packages.

if (requireNamespace("TxDb.Hsapiens.UCSC.hg38.knownGene", quietly = TRUE) &&
    requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
    
  IS_raw <- get_feature(IS_raw)
  
  # Check for overlap with specific genomic elements
  IS_raw <- Enhancer_check(IS_raw)
  IS_raw <- Promotor_check(IS_raw)
  IS_raw <- Safeharbor_check(IS_raw)
  
  names(IS_raw)
} else {
  message("Skipping annotation: Required annotation packages not installed.")
}
#>  [1] "Sample"             "SCount"             "Chr"               
#>  [4] "Locus"              "Clone_contribution" "in_gene"           
#>  [7] "in_exon"            "in_intron"          "nearest_distance"  
#> [10] "start"              "end"                "width"             
#> [13] "strand"             "nearest_gene_name"  "Enhancer"          
#> [16] "Promotor"           "Safeharbor"

C:7bHOv5c3865012065-intro.R

Step 3: Integration Site Analysis

Common Integration Sites (CIS)

Identify regions with recurrent integration sites.

CIS_top <- CIS(IS_raw = IS_raw, connect_distance = 50000)
#> CIS_network calculates interactions between integration sites in a network view
CIS_by_sample <- CIS_overlap(CIS_data = CIS_top, IS_raw = IS_raw)
CIS_by_sample

Chr	Locus	Gene	Total_dots	Central_connectivity	Dimention	Order	Gene_network	Sample_A	Sample_B	Sample_C
chr14	46622339	RPL10L	5	3	23715	10	RPL10L	FALSE	TRUE	TRUE
chr6	119446557	LOC105377975	4	3	29464	7	LOC105377975	TRUE	TRUE	FALSE
chr7	49323212	LOC124901804	4	3	30996	7	LOC124901804	TRUE	TRUE	TRUE
chr20	38686365	SLC32A1	4	3	33175	7	SLC32A1; ARHGAP40	FALSE	TRUE	TRUE
chr11	49305065	FOLH1	4	2	32606	6	FOLH1	TRUE	TRUE	TRUE
chr17	96254683	ZNF750	4	2	32661	6	ZNF750	FALSE	TRUE	TRUE
chr12	136321446	ANHX	4	1	51276	5	ANHX	FALSE	TRUE	TRUE
chr8	4382729	CSMD1	4	2	34476	5	CSMD1	TRUE	FALSE	TRUE
chr4	102665076	MANBA	4	2	41393	5	MANBA	TRUE	TRUE	TRUE
chr19	117947388	CENPBD2P	3	2	2379	4	CENPBD2P	TRUE	TRUE	TRUE

C:7bHOv5c3865012065-intro.R

Chromosome Distribution

chr_stats <- chr_distribution(IS_raw)
print(chr_stats)

C:7bHOv5c3865012065-intro.R

Gene Set Overlap

if (requireNamespace("TxDb.Hsapiens.UCSC.hg38.knownGene", quietly = TRUE) &&
    requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
  # Adverse Event genes
  ae_overlap <- is_in_AE_gene(IS_raw = IS_raw, Distance = 100000)
  
  # Cancer Genes
  cg_overlap <- is_in_CG_gene(IS_raw = IS_raw, threashold = 0.001)
  print(cg_overlap)
  
  # Immune Genes
  immune_overlap <- is_in_immune_gene(IS_raw = IS_raw, threashold = 0.001)
}

C:7bHOv5c3865012065-intro.R

Step 4: Longitudinal Analysis (PMD)

Perform Population Matching Distribution (PMD) analysis.

PMD_data <- pmd_analysis(IS_raw = IS_raw, Patient_timepoint = Patient_timepoint)
head(PMD_data)
#>           UIS    TOP_P Richness  Eveness        PMD   Sample Time
#> Sample_A 3319 4.126260 11.69653 4.599022 -0.6259863 Sample_A   3m
#> Sample_B 3334 3.217942 11.70304 4.957718 -0.4442108 Sample_B  12m
#> Sample_C 3347 3.079673 11.70865 5.021079 -0.4134854 Sample_C  24m
# Plot Richness and Evenness
plot_richness_evenness(PMD_data = PMD_data)


# Analyze linked timepoints
linked_data <- Linked_timepoints(IS_raw = IS_raw, Patient_timepoint = Patient_timepoint)
print(linked_data)

C:7bHOv5c3865012065-intro.R

Step 5: Clonal Dominance Analysis

Analyze potential dominant clones using cumulative distribution.

IS_ratio <- fit_cum_simple(IS_raw$SCount)
print(Cumulative_curve(IS_ratio)) # Function for plotting
#> [[1]]

#> 
#> [[2]]
#> # A tibble: 8 × 8
#>   estimate statistic  p.value parameter conf.low conf.high method    alternative
#>      <dbl>     <dbl>    <dbl>     <dbl>    <dbl>     <dbl> <chr>     <chr>      
#> 1  0.00260    -945.  7.42e-46        19  0.00212   0.00308 One Samp… two.sided  
#> 2  0.0183     -255.  4.93e-35        19  0.0149    0.0216  One Samp… two.sided  
#> 3  0.0319     -239.  1.65e-34        19  0.0266    0.0371  One Samp… two.sided  
#> 4  0.0549     -255.  4.85e-35        19  0.0475    0.0622  One Samp… two.sided  
#> 5  0.0954     -171.  9.20e-32        19  0.0843    0.106   One Samp… two.sided  
#> 6  0.166      -101.  1.92e-27        19  0.149     0.184   One Samp… two.sided  
#> 7  0.286       -58.9 5.68e-23        19  0.261     0.312   One Samp… two.sided  
#> 8  0.478       -38.1 2.12e-19        19  0.449     0.507   One Samp… two.sided  
#> 
#> [[3]]
#> 
#>  Wilcoxon rank sum test with continuity correction
#> 
#> data:  Sampe_l2d and distances
#> W = 3800, p-value = 2.005e-13
#> alternative hypothesis: true location shift is not equal to 0

C:7bHOv5c3865012065-intro.R

Step 6: Visualization

Treemap

if (requireNamespace("treemapify", quietly = TRUE)) {
    IS_treemap(IS_raw = IS_raw, Patient_timepoint = Patient_timepoint)
}

C:7bHOv5c3865012065-intro.R

Region Counts

Region_data <- Count_regions(IS_raw = IS_raw, Patient_timepoint = Patient_timepoint)
head(Region_data)
#> $Sample_A
#>      Product        Share Percentage      Label Time
#> 1     Exonic 0.0334438084        3.3     Exonic   3m
#> 2   Intronic 0.3711961434       37.1   Intronic   3m
#> 3  Integenic 0.5953600482       59.5  Integenic   3m
#> 4   Enhancer 0.0006025911        0.1   Enhancer   3m
#> 5   Promotor 0.0000000000        0.0   Promotor   3m
#> 6 SafeHarbor 0.0370593552        3.7 SafeHarbor   3m
#> 
#> $Sample_B
#>      Product      Share Percentage      Label Time
#> 1     Exonic 0.03359328        3.4     Exonic  12m
#> 2   Intronic 0.36142771       36.1   Intronic  12m
#> 3  Integenic 0.60497900       60.5  Integenic  12m
#> 4   Enhancer 0.00269946        0.3   Enhancer  12m
#> 5   Promotor 0.00000000        0.0   Promotor  12m
#> 6 SafeHarbor 0.03839232        3.8 SafeHarbor  12m
#> 
#> $Sample_C
#>      Product       Share Percentage      Label Time
#> 1     Exonic 0.031968927        3.2     Exonic  24m
#> 2   Intronic 0.387212429       38.7   Intronic  24m
#> 3  Integenic 0.580818644       58.1  Integenic  24m
#> 4   Enhancer 0.002091425        0.2   Enhancer  24m
#> 5   Promotor 0.000000000        0.0   Promotor  24m
#> 6 SafeHarbor 0.037048103        3.7 SafeHarbor  24m
plot_regions(Region_data = Region_data)

C:7bHOv5c3865012065-intro.R

Ideogram

# Example usage:
# ideogram_plot(IS_raw, output_dir = tempdir())

C:7bHOv5c3865012065-intro.R

Introduction to LISAT

Introduction

Setup

Step 1: Data Preparation

Step 2: Genomic Feature Annotation

Step 3: Integration Site Analysis

Common Integration Sites (CIS)

Chromosome Distribution

Gene Set Overlap

Step 4: Longitudinal Analysis (PMD)

Step 5: Clonal Dominance Analysis

Step 6: Visualization

Treemap

Region Counts

Ideogram