suppressPackageStartupMessages(library(Biostrings))
#> Warning: package 'IRanges' was built under R version 4.3.1
#> Warning: package 'GenomeInfoDb' was built under R version 4.3.1
library(cubar)
library(ggplot2)
CDS sequence QC and basic manipulation.
# example data: a DNAStringSet of 6600 coding sequences named by gene ID (printed below)
yeast_cds
#> DNAStringSet object of length 6600:
#> width seq names
#> [1] 471 ATGAGTTCCCGGTTTGCAAGAA...GATGTGGATATGGATGCGTAA YPL071C
#> [2] 432 ATGTCTAGATCTGGTGTTGCTG...AGAGGCGCTGGTTCTCATTAA YLL050C
#> [3] 2160 ATGTCTGGAATGGGTATTGCGA...GAGAGCCTTGCTGGAATATAG YMR172W
#> [4] 663 ATGTCAGCACCTGCTCAAAACA...GAAGACGATGCTGATTTATAA YOR185C
#> [5] 2478 ATGGATAACTTCAAAATTTACA...TATCAAAATGGCAGAAAATGA YLL032C
#> ... ... ...
#> [6596] 1902 ATGCCAGACAATCTATCATTAC...CACGAAAAGACTTTCATTTAA YBR021W
#> [6597] 138 ATGAGGGTTCTCCATGTTATGC...AAAAAAAAAAAAAAAAGATGA YDR320W-B
#> [6598] 360 ATGTTTATTCTAGCAGAGGTTT...AATGCCGCGCTGGACGATTAA YBR232C
#> [6599] 1704 ATGGCAAGCGAACAGTCCTCAC...TTCCCAAAGAGTTTTAATTGA YDL245C
#> [6600] 906 ATGTTGAATAGTTCAAGAAAAT...TACTCTTTTATCTTCAATTGA YBR024W
# run quality control on the CDSs
# NOTE: judging by the output below, the QC'ed sequences no longer carry the
# start and stop codons: YDR320W-B is 138 nt (46 codons) in the raw data but
# yields 44 codons here, beginning after the initial ATG
yeast_cds_qc <- check_cds(yeast_cds)
# convert a CDS to codon sequence
seq_to_codons(yeast_cds_qc[['YDR320W-B']])
#> [1] "AGG" "GTT" "CTC" "CAT" "GTT" "ATG" "CTT" "TCT" "TTC" "CTA" "AAC" "TCA"
#> [13] "CTT" "CTT" "TTC" "CTC" "CCT" "ATC" "TGC" "TTT" "TGT" "TTA" "TTA" "CAG"
#> [25] "TTG" "AAG" "GCT" "ACT" "TGT" "GCC" "GTT" "CGT" "GTG" "AAA" "AAA" "TAC"
#> [37] "TCG" "ATG" "AAA" "AAA" "AAA" "AAA" "AAA" "AGA"
# convert a CDS to amino acid sequence (one residue per codon, so the 44 codons give 44 letters)
Biostrings::translate(yeast_cds_qc[['YDR320W-B']])
#> 44-letter AAString object
#> seq: RVLHVMLSFLNSLLFLPICFCLLQLKATCAVRVKKYSMKKKKKR
# get codon frequency of the QC'ed CDSs; the result is row-named by gene ID
# (later steps index it via rownames() and gene IDs)
yeast_cf <- count_codons(yeast_cds_qc)
Get codon table and visualize
# get codon table for the standard genetic code
# NOTE(review): gcid presumably follows the NCBI genetic-code numbering -- confirm in ?get_codon_table
ctab <- get_codon_table(gcid = '1')
# plot possible codon and anticodon pairings
plot_ca_pairing(ctab)
Calculate effective number of codons (ENC)
# get enc: one value per gene, returned as a vector named by gene ID (see the head() output)
enc <- get_enc(yeast_cf)
head(enc)
#> YPL071C YLL050C YMR172W YOR185C YLL032C YBR225W
#> 53.00343 45.06356 56.01914 50.84984 53.29440 53.82957
# Draw a histogram of a per-gene statistic.
#   x:    vector (or list) accepted by stack(); its elements become the
#         `values` column that is plotted
#   xlab: label for the x axis
# Returns the ggplot object.
plot_dist <- function(x, xlab = 'values'){
  # stack() reshapes the input into a data frame with `values` and `ind` columns
  long <- stack(x)
  hist_plot <- ggplot(long, aes(x = values))
  hist_plot <- hist_plot + geom_histogram()
  hist_plot + labs(x = xlab, y = 'Number of genes')
}
# histogram of ENC across genes
plot_dist(enc, xlab = 'ENC')
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Calculate fraction of optimal codons (Fop)
# estimate optimal codons
# the result has one row per codon, with coef, pvalue and qvalue columns (see the head() output)
optimal_codons <- est_optimal_codons(yeast_cds_qc, codon_table = ctab)
head(optimal_codons)
#> aa_code amino_acid codon subfam anticodon coef pvalue
#> 1: A Ala GCT Ala_GC AGC -0.08702058 0.00000e+00
#> 2: A Ala GCC Ala_GC GGC -0.01876569 2.16499e-40
#> 3: A Ala GCA Ala_GC TGC 0.08612405 0.00000e+00
#> 4: A Ala GCG Ala_GC CGC 0.13245286 0.00000e+00
#> 5: R Arg AGA Arg_AG TCT -0.13023392 0.00000e+00
#> 6: R Arg AGG Arg_AG CCT 0.13023392 0.00000e+00
#> qvalue
#> 1: 0.00000e+00
#> 2: 2.40117e-40
#> 3: 0.00000e+00
#> 4: 0.00000e+00
#> 5: 0.00000e+00
#> 6: 0.00000e+00
# get fop
# use the QC'ed CDSs -- the same input given to est_optimal_codons() -- so that
# Fop is computed on the same sequences as the other indices rather than on the
# raw, unchecked CDSs
fop <- get_fop(yeast_cds_qc)
#> Warning in `[.data.table`(bingreg, , `:=`(c("codon_b1", "codon_b2", "codon_b3",
#> : Column 'codon_b1' does not exist to remove
#> Warning in `[.data.table`(bingreg, , `:=`(c("codon_b1", "codon_b2", "codon_b3",
#> : Column 'codon_b2' does not exist to remove
#> Warning in `[.data.table`(bingreg, , `:=`(c("codon_b1", "codon_b2", "codon_b3",
#> : Column 'codon_b3' does not exist to remove
# histogram of Fop across genes
plot_dist(fop, xlab = 'Fop')
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Calculate Codon Adaptation Index (CAI)
# estimate RSCU of highly expressed genes
# rank genes by decreasing FPKM and keep the top 500
fpkm_rank <- order(-yeast_exp$fpkm)
yeast_heg <- head(yeast_exp[fpkm_rank, ], n = 500)
# keep only the genes that have a row in the codon frequency table
in_cf <- yeast_heg$gene_id %in% rownames(yeast_cf)
yeast_heg <- yeast_heg[in_cf, ]
heg_cf <- yeast_cf[yeast_heg$gene_id, ]
rscu_heg <- est_rscu(heg_cf, codon_table = ctab)
# calculate CAI of all genes
# note: CAI values are usually calculated based on the RSCU of highly expressed genes.
cai <- get_cai(yeast_cf, rscu = rscu_heg)
plot_dist(cai, xlab = 'CAI')
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Calculate tRNA Adaptation Index (tAI)
# get tRNA gene copy number from GtRNADB
path_gtrnadb <- 'http://gtrnadb.ucsc.edu/genomes/eukaryota/Scere3/sacCer3-mature-tRNAs.fa'
yeast_trna <- Biostrings::readRNAStringSet(path_gtrnadb)
# sequence ID = header text before the first space; its third '-'-separated
# field is tabulated (presumably the anticodon -- confirm against the FASTA headers)
trna_ids <- sub(' .*', '', names(yeast_trna))
trna_gcn <- table(data.table::tstrsplit(trna_ids, '-')[[3]])
# drop the 'NNN' entry, leaving the gene copy number of each anticodon
is_nnn <- names(trna_gcn) == 'NNN'
trna_gcn <- trna_gcn[!is_nnn]
# calculate tRNA weight for each codon
trna_w <- est_trna_weight(trna_level = trna_gcn, codon_table = ctab)
# get tAI
tai <- get_tai(yeast_cf, trna_w = trna_w)
plot_dist(tai, xlab = 'tAI')
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.