Introduction

suppressPackageStartupMessages(library(Biostrings))
#> Warning: package 'IRanges' was built under R version 4.3.1
#> Warning: package 'GenomeInfoDb' was built under R version 4.3.1
library(cubar)
library(ggplot2)

Common analyses

CDS sequence QC and basic manipulation.

# example data: `yeast_cds` is not defined in this document (presumably a dataset
# bundled with cubar -- confirm); printing it shows a DNAStringSet of 6600 CDSs
yeast_cds
#> DNAStringSet object of length 6600:
#>        width seq                                            names               
#>    [1]   471 ATGAGTTCCCGGTTTGCAAGAA...GATGTGGATATGGATGCGTAA YPL071C
#>    [2]   432 ATGTCTAGATCTGGTGTTGCTG...AGAGGCGCTGGTTCTCATTAA YLL050C
#>    [3]  2160 ATGTCTGGAATGGGTATTGCGA...GAGAGCCTTGCTGGAATATAG YMR172W
#>    [4]   663 ATGTCAGCACCTGCTCAAAACA...GAAGACGATGCTGATTTATAA YOR185C
#>    [5]  2478 ATGGATAACTTCAAAATTTACA...TATCAAAATGGCAGAAAATGA YLL032C
#>    ...   ... ...
#> [6596]  1902 ATGCCAGACAATCTATCATTAC...CACGAAAAGACTTTCATTTAA YBR021W
#> [6597]   138 ATGAGGGTTCTCCATGTTATGC...AAAAAAAAAAAAAAAAGATGA YDR320W-B
#> [6598]   360 ATGTTTATTCTAGCAGAGGTTT...AATGCCGCGCTGGACGATTAA YBR232C
#> [6599]  1704 ATGGCAAGCGAACAGTCCTCAC...TTCCCAAAGAGTTTTAATTGA YDL245C
#> [6600]   906 ATGTTGAATAGTTCAAGAAAAT...TACTCTTTTATCTTCAATTGA YBR024W
# run QC on the raw CDSs; every downstream step except the raw print above uses `yeast_cds_qc`
# NOTE(review): judging from the output below, the QC'd YDR320W-B sequence (44 codons)
# no longer carries the leading ATG / trailing TGA of the raw 138-nt CDS -- confirm
# the exact filtering/trimming rules in the check_cds documentation
yeast_cds_qc <- check_cds(yeast_cds)

# convert a CDS to codon sequence
seq_to_codons(yeast_cds_qc[['YDR320W-B']])
#>  [1] "AGG" "GTT" "CTC" "CAT" "GTT" "ATG" "CTT" "TCT" "TTC" "CTA" "AAC" "TCA"
#> [13] "CTT" "CTT" "TTC" "CTC" "CCT" "ATC" "TGC" "TTT" "TGT" "TTA" "TTA" "CAG"
#> [25] "TTG" "AAG" "GCT" "ACT" "TGT" "GCC" "GTT" "CGT" "GTG" "AAA" "AAA" "TAC"
#> [37] "TCG" "ATG" "AAA" "AAA" "AAA" "AAA" "AAA" "AGA"

# convert a CDS to amino acid sequence
# (one residue per codon listed above: 44 codons -> 44-letter AAString)
Biostrings::translate(yeast_cds_qc[['YDR320W-B']])
#> 44-letter AAString object
#> seq: RVLHVMLSFLNSLLFLPICFCLLQLKATCAVRVKKYSMKKKKKR

# get codon frequency of the QC-passed CDSs
# `yeast_cf` is the input to the ENC, CAI and tAI steps below; its row names are
# matched against gene IDs when selecting highly expressed genes
yeast_cf <- count_codons(yeast_cds_qc)

Get codon table and visualize

# get codon table for the standard genetic code (selected here with gcid = '1')
# `ctab` is reused below when estimating optimal codons, RSCU and tRNA weights
ctab <- get_codon_table(gcid = '1')

# plot possible codon and anticodon pairings
plot_ca_pairing(ctab)

Calculate effective number of codons (ENC)

# get enc from the per-gene codon frequencies in `yeast_cf`
enc <- get_enc(yeast_cf)
# preview the first values: a numeric vector named by gene ID (see output below)
head(enc)
#>  YPL071C  YLL050C  YMR172W  YOR185C  YLL032C  YBR225W 
#> 53.00343 45.06356 56.01914 50.84984 53.29440 53.82957

# Draw a histogram of per-gene values.
#   x    : named vector (or list) of values; the names are gene IDs
#   xlab : label for the x axis
# Returns the ggplot object; the y axis is always labelled 'Number of genes'.
plot_dist <- function(x, xlab = 'values'){
    # stack() reshapes the named input into a data.frame with columns
    # `values` and `ind`; only `values` is plotted
    stacked <- stack(x)
    histogram <- ggplot(stacked, aes(x = values))
    histogram <- histogram + geom_histogram()
    histogram + labs(x = xlab, y = 'Number of genes')
}

# histogram of the per-gene ENC values
plot_dist(enc, 'ENC')
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculate fraction of optimal codons (Fop)

# estimate optimal codons from the QC-passed CDSs, using the standard codon table `ctab`
optimal_codons <- est_optimal_codons(yeast_cds_qc, codon_table = ctab)
# preview: one row per codon, with coef, pvalue and qvalue columns (see output below)
head(optimal_codons)
#>    aa_code amino_acid codon subfam anticodon        coef      pvalue
#> 1:       A        Ala   GCT Ala_GC       AGC -0.08702058 0.00000e+00
#> 2:       A        Ala   GCC Ala_GC       GGC -0.01876569 2.16499e-40
#> 3:       A        Ala   GCA Ala_GC       TGC  0.08612405 0.00000e+00
#> 4:       A        Ala   GCG Ala_GC       CGC  0.13245286 0.00000e+00
#> 5:       R        Arg   AGA Arg_AG       TCT -0.13023392 0.00000e+00
#> 6:       R        Arg   AGG Arg_AG       CCT  0.13023392 0.00000e+00
#>         qvalue
#> 1: 0.00000e+00
#> 2: 2.40117e-40
#> 3: 0.00000e+00
#> 4: 0.00000e+00
#> 5: 0.00000e+00
#> 6: 0.00000e+00

# get fop
# use the QC-passed CDSs (not the raw `yeast_cds`) so that Fop is computed on the
# same gene set as the optimal-codon estimate and the codon counts in `yeast_cf`
fop <- get_fop(yeast_cds_qc)
#> Warning in `[.data.table`(bingreg, , `:=`(c("codon_b1", "codon_b2", "codon_b3",
#> : Column 'codon_b1' does not exist to remove
#> Warning in `[.data.table`(bingreg, , `:=`(c("codon_b1", "codon_b2", "codon_b3",
#> : Column 'codon_b2' does not exist to remove
#> Warning in `[.data.table`(bingreg, , `:=`(c("codon_b1", "codon_b2", "codon_b3",
#> : Column 'codon_b3' does not exist to remove
# histogram of the per-gene Fop values
plot_dist(fop, 'Fop')
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculate Codon Adaptation Index (CAI)

# estimate RSCU of highly expressed genes
# `yeast_exp` is not defined in this document (presumably an expression table bundled
# with cubar -- confirm); take its 500 rows with the largest fpkm
yeast_heg <- head(yeast_exp[order(-yeast_exp$fpkm), ], n = 500)
# keep only the genes that have a row in `yeast_cf`, so the row lookup below cannot fail
yeast_heg <- yeast_heg[yeast_heg$gene_id %in% rownames(yeast_cf), ]
rscu_heg <- est_rscu(yeast_cf[yeast_heg$gene_id, ], codon_table = ctab)

# calculate CAI of all genes
# note: CAI values are usually calculated based on the RSCU of highly expressed genes.
cai <- get_cai(yeast_cf, rscu = rscu_heg)
plot_dist(cai, xlab = 'CAI')
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculate tRNA Adaptation Index (tAI)

# get tRNA gene copy number from GtRNADB
# note: the FASTA file is read straight from a URL, so this step needs network access
path_gtrnadb <- 'http://gtrnadb.ucsc.edu/genomes/eukaryota/Scere3/sacCer3-mature-tRNAs.fa'
yeast_trna <- Biostrings::readRNAStringSet(path_gtrnadb)
# drop everything after the first space in each sequence name, split the remainder
# on '-', and tabulate the third field
# (assumes the third '-'-separated field is the anticodon -- confirm against the FASTA headers)
trna_gcn <- table(data.table::tstrsplit(sub(' .*', '', names(yeast_trna)), '-')[[3]])
trna_gcn <- trna_gcn[names(trna_gcn) != 'NNN'] # copy number of each anticodon; 'NNN' entries are dropped

# calculate tRNA weight for each codon
# the anticodon counts are passed as `trna_level`, with the standard codon table `ctab`
trna_w <- est_trna_weight(trna_level = trna_gcn, codon_table = ctab)

# get tAI
# computed from the per-gene codon frequencies and the codon weights above
tai <- get_tai(yeast_cf, trna_w = trna_w)
plot_dist(tai, 'tAI')
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.