An introduction to the bambu package using NanoporeRNASeq data

Introduction

NanoporeRNASeq contains RNA-Seq data from the K562 and MCF7 cell lines that were generated by the SG-NEx project (https://github.com/GoekeLab/sg-nex-data). Each of these cell line has three replicates, with 1 direct RNA sequencing data and 2 cDNA sequencing data. The files contains reads aligned to the human genome (Grch38) chromosome 22 (1:25500000).

Accessing NanoporeRNASeq data

Load the NanoporeRNASeq package

library("NanoporeRNASeq")

List the samples

data("SGNexSamples")
SGNexSamples
##> DataFrame with 6 rows and 6 columns
##>                sample_id    Platform    cellLine    protocol cancer_type
##>              <character> <character> <character> <character> <character>
##> 1 K562_directcDNA_repl..      MinION        K562  directcDNA   Leukocyte
##> 2 K562_directcDNA_repl..     GridION        K562  directcDNA   Leukocyte
##> 3 K562_directRNA_repli..     GridION        K562   directRNA   Leukocyte
##> 4 MCF7_directcDNA_repl..      MinION        MCF7  directcDNA      Breast
##> 5 MCF7_directcDNA_repl..     GridION        MCF7  directcDNA      Breast
##> 6 MCF7_directRNA_repli..     GridION        MCF7   directRNA      Breast
##>                fileNames
##>              <character>
##> 1 NanoporeRNASeq/versi..
##> 2 NanoporeRNASeq/versi..
##> 3 NanoporeRNASeq/versi..
##> 4 NanoporeRNASeq/versi..
##> 5 NanoporeRNASeq/versi..
##> 6 NanoporeRNASeq/versi..

List the available BamFile

library(ExperimentHub)
NanoporeData <- query(ExperimentHub(), c("NanoporeRNA", "GRCh38", "Bam"))
bamFiles <- Rsamtools::BamFileList(NanoporeData[["EH3808"]], NanoporeData[["EH3809"]],
    NanoporeData[["EH3810"]], NanoporeData[["EH3811"]], NanoporeData[["EH3812"]],
    NanoporeData[["EH3813"]])

Get the annotation GRangesList

data("HsChr22BambuAnnotation")
HsChr22BambuAnnotation
##> GRangesList object of length 1500:
##> $ENST00000043402
##> GRanges object with 2 ranges and 2 metadata columns:
##>       seqnames            ranges strand | exon_rank exon_endRank
##>          <Rle>         <IRanges>  <Rle> | <integer>    <integer>
##>   [1]       22 20241415-20243110      - |         2            1
##>   [2]       22 20268071-20268531      - |         1            2
##>   -------
##>   seqinfo: 1 sequence from an unspecified genome; no seqlengths
##> 
##> $ENST00000086933
##> GRanges object with 3 ranges and 2 metadata columns:
##>       seqnames            ranges strand | exon_rank exon_endRank
##>          <Rle>         <IRanges>  <Rle> | <integer>    <integer>
##>   [1]       22 19148576-19149095      - |         3            1
##>   [2]       22 19149663-19149916      - |         2            2
##>   [3]       22 19150025-19150283      - |         1            3
##>   -------
##>   seqinfo: 1 sequence from an unspecified genome; no seqlengths
##> 
##> $ENST00000155674
##> GRanges object with 8 ranges and 2 metadata columns:
##>       seqnames            ranges strand | exon_rank exon_endRank
##>          <Rle>         <IRanges>  <Rle> | <integer>    <integer>
##>   [1]       22 17137511-17138357      - |         8            1
##>   [2]       22 17138550-17138738      - |         7            2
##>   [3]       22 17141059-17141233      - |         6            3
##>   [4]       22 17143098-17143131      - |         5            4
##>   [5]       22 17145024-17145117      - |         4            5
##>   [6]       22 17148448-17148560      - |         3            6
##>   [7]       22 17149542-17149745      - |         2            7
##>   [8]       22 17165209-17165287      - |         1            8
##>   -------
##>   seqinfo: 1 sequence from an unspecified genome; no seqlengths
##> 
##> ...
##> <1497 more elements>

Visualizing gene of interest from a single bam file

We can visualize the one sample for a single gene ENST00000215832 (MAPK1)

library(ggbio)
range <- HsChr22BambuAnnotation$ENST00000215832
# plot mismatch track
library(BSgenome.Hsapiens.NCBI.GRCh38)
# plot annotation track
tx <- autoplot(range, aes(col = strand), group.selfish = TRUE)
# plot coverage track
coverage <- autoplot(bamFiles[[1]], aes(col = coverage), which = range)

# merge the tracks into one plot
tracks(annotation = tx, coverage = coverage, heights = c(1, 3)) + theme_minimal()

Running Bambu with NanoporeRNASeq data

Load the bambu package

library(bambu)
genomeSequenceData <- query(ExperimentHub(), c("NanoporeRNA", "GRCh38", "FASTA"))
genomeSequence <- genomeSequenceData[["EH7260"]]

Run bambu

Applying bambu to bamFiles

se <- bambu(reads = bamFiles, annotations = HsChr22BambuAnnotation, genome = genomeSequence)

bambu returns a SummarizedExperiment object

se
##> class: RangedSummarizedExperiment 
##> dim: 1542 6 
##> metadata(2): incompatibleCounts warnings
##> assays(4): counts CPM fullLengthCounts uniqueCounts
##> rownames(1542): BambuTx1 BambuTx2 ... ENST00000641933 ENST00000641967
##> rowData names(11): TXNAME GENEID ... txid eqClassById
##> colnames(6): 17c3bd19552d0f_3844 17c3bd7ee4641b_3846 ...
##>   17c3bd3686d5e7_3852 17c3bd6e8dd009_3854
##> colData names(1): name

Visualizing gene examples

We can visualize the annotated and novel isoforms identified in this gene example using plot functions from bambu

plotBambu(se, type = "annotation", gene_id = "ENSG00000099968")

##> [[1]]
##> TableGrob (3 x 1) "arrange": 3 grobs
##>   z     cells    name                grob
##> 1 1 (2-2,1-1) arrange      gtable[layout]
##> 2 2 (3-3,1-1) arrange      gtable[layout]
##> 3 3 (1-1,1-1) arrange text[GRID.text.245]
sessionInfo()
##> R Under development (unstable) (2024-10-21 r87258)
##> Platform: x86_64-pc-linux-gnu
##> Running under: Ubuntu 24.04.1 LTS
##> 
##> Matrix products: default
##> BLAS:   /home/biocbuild/bbs-3.21-bioc/R/lib/libRblas.so 
##> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
##> 
##> locale:
##>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##>  [3] LC_TIME=en_GB              LC_COLLATE=C              
##>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
##> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
##> 
##> time zone: America/New_York
##> tzcode source: system (glibc)
##> 
##> attached base packages:
##> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
##> [8] base     
##> 
##> other attached packages:
##>  [1] bambu_3.9.0                           
##>  [2] SummarizedExperiment_1.37.0           
##>  [3] Biobase_2.67.0                        
##>  [4] MatrixGenerics_1.19.0                 
##>  [5] matrixStats_1.4.1                     
##>  [6] BSgenome.Hsapiens.NCBI.GRCh38_1.3.1000
##>  [7] BSgenome_1.75.0                       
##>  [8] rtracklayer_1.67.0                    
##>  [9] BiocIO_1.17.0                         
##> [10] ggbio_1.55.0                          
##> [11] ggplot2_3.5.1                         
##> [12] Rsamtools_2.23.0                      
##> [13] Biostrings_2.75.0                     
##> [14] XVector_0.47.0                        
##> [15] GenomicRanges_1.59.0                  
##> [16] GenomeInfoDb_1.43.0                   
##> [17] IRanges_2.41.0                        
##> [18] S4Vectors_0.45.0                      
##> [19] NanoporeRNASeq_1.17.0                 
##> [20] ExperimentHub_2.15.0                  
##> [21] AnnotationHub_3.15.0                  
##> [22] BiocFileCache_2.15.0                  
##> [23] dbplyr_2.5.0                          
##> [24] BiocGenerics_0.53.1                   
##> [25] generics_0.1.3                        
##> 
##> loaded via a namespace (and not attached):
##>   [1] RColorBrewer_1.1-3       rstudioapi_0.17.1        jsonlite_1.8.9          
##>   [4] magrittr_2.0.3           GenomicFeatures_1.59.0   farver_2.1.2            
##>   [7] rmarkdown_2.29           zlibbioc_1.53.0          vctrs_0.6.5             
##>  [10] memoise_2.0.1            RCurl_1.98-1.16          base64enc_0.1-3         
##>  [13] progress_1.2.3           htmltools_0.5.8.1        S4Arrays_1.7.1          
##>  [16] curl_5.2.3               xgboost_1.7.8.1          SparseArray_1.7.0       
##>  [19] Formula_1.2-5            sass_0.4.9               bslib_0.8.0             
##>  [22] htmlwidgets_1.6.4        httr2_1.0.6              plyr_1.8.9              
##>  [25] cachem_1.1.0             GenomicAlignments_1.43.0 mime_0.12               
##>  [28] lifecycle_1.0.4          pkgconfig_2.0.3          Matrix_1.7-1            
##>  [31] R6_2.5.1                 fastmap_1.2.0            GenomeInfoDbData_1.2.13 
##>  [34] digest_0.6.37            colorspace_2.1-1         GGally_2.2.1            
##>  [37] AnnotationDbi_1.69.0     OrganismDbi_1.49.0       Hmisc_5.2-0             
##>  [40] RSQLite_2.3.7            labeling_0.4.3           filelock_1.0.3          
##>  [43] fansi_1.0.6              httr_1.4.7               abind_1.4-8             
##>  [46] compiler_4.5.0           bit64_4.5.2              withr_3.0.2             
##>  [49] htmlTable_2.4.3          backports_1.5.0          BiocParallel_1.41.0     
##>  [52] DBI_1.2.3                ggstats_0.7.0            highr_0.11              
##>  [55] biomaRt_2.63.0           rappdirs_0.3.3           DelayedArray_0.33.1     
##>  [58] rjson_0.2.23             tools_4.5.0              foreign_0.8-87          
##>  [61] nnet_7.3-19              glue_1.8.0               restfulr_0.0.15         
##>  [64] grid_4.5.0               checkmate_2.3.2          cluster_2.1.6           
##>  [67] reshape2_1.4.4           gtable_0.3.6             tidyr_1.3.1             
##>  [70] ensembldb_2.31.0         hms_1.1.3                data.table_1.16.2       
##>  [73] xml2_1.3.6               utf8_1.2.4               BiocVersion_3.21.1      
##>  [76] pillar_1.9.0             stringr_1.5.1            dplyr_1.1.4             
##>  [79] lattice_0.22-6           bit_4.5.0                biovizBase_1.55.0       
##>  [82] RBGL_1.83.0              tidyselect_1.2.1         knitr_1.48              
##>  [85] gridExtra_2.3            ProtGenerics_1.39.0      xfun_0.49               
##>  [88] stringi_1.8.4            UCSC.utils_1.3.0         lazyeval_0.2.2          
##>  [91] yaml_2.3.10              evaluate_1.0.1           codetools_0.2-20        
##>  [94] tibble_3.2.1             graph_1.85.0             BiocManager_1.30.25     
##>  [97] cli_3.6.3                rpart_4.1.23             munsell_0.5.1           
##> [100] jquerylib_0.1.4          dichromat_2.0-0.1        Rcpp_1.0.13-1           
##> [103] png_0.1-8                XML_3.99-0.17            parallel_4.5.0          
##> [106] blob_1.2.4               prettyunits_1.2.0        AnnotationFilter_1.31.0 
##> [109] bitops_1.0-9             txdbmaker_1.3.0          VariantAnnotation_1.53.0
##> [112] scales_1.3.0             purrr_1.0.2              crayon_1.5.3            
##> [115] rlang_1.1.4              KEGGREST_1.47.0          formatR_1.14