Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions R/summariseExpression.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#' Summarise transcript expression to exon-level expression
#' @title summarise by exon
#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}}
#' @return A \code{RangedSummarizedExperiment} with exon-level expression.
#' Rows are named \code{EX1, EX2, ...} in order of first appearance.
#' \code{rowRanges} is a \code{GRanges} with one entry per unique exon locus.
#' \code{mcols} columns:
#' \describe{
#' \item{GENEID}{Comma-separated gene IDs for transcripts sharing the exon}
#' \item{txNames}{Comma-separated transcript IDs that contain this exon}
#' \item{exonClass}{\code{"annotated"} if the exon coordinates exist in the
#' reference annotation, \code{"novel"} otherwise}
#' }
#' @details Counts (and other assays) are summed across all transcripts sharing
#' the same exon (identical seqnames, start, end, strand). CPM is recomputed
#' from the aggregated counts. \code{fullLengthCounts} and
#' \code{uniqueCounts} are summed where present.
#' @importFrom Matrix sparseMatrix
#' @importFrom SummarizedExperiment rowRanges rowData
#' @importFrom GenomicRanges GRanges seqnames start end strand
#' @importFrom IRanges IRanges
#' @importFrom dplyr left_join mutate group_by summarise filter distinct
#' @export
summariseByExon <- function(se) {
exonRanges <- unlist(rowRanges(se), use.names = TRUE)
txNames <- rownames(se)

# Build data.frame of exon-transcript pairs
exonDf <- data.frame(
TXNAME = names(exonRanges),
seqnames = as.character(seqnames(exonRanges)),
start = start(exonRanges),
end = end(exonRanges),
strand = as.character(strand(exonRanges)),
stringsAsFactors = FALSE
) |>
mutate(exon_id = paste(seqnames, start, end, strand, sep = ":"))
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

%>% group_by(...) + create index


# Attach GENEID and txClassDescription from rowData
geneDf <- as.data.frame(rowData(se))[, c("GENEID", "txClassDescription"), drop = FALSE]
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

novelTranscripts should be there

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isn't TXNAME already in rowData column?

geneDf$TXNAME <- rownames(se)
exonDf <- left_join(exonDf, geneDf, by = "TXNAME")

# Collapse coordinates and GENEID/txNames per unique exon
exonMeta <- exonDf |>
group_by(exon_id) |>
summarise(
seqnames = seqnames[1],
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in fact, if exon_id can be used, why not group_by exon_id, seqnames, start, end and strand? because they would just be shared for the same exon_id

start = start[1],
end = end[1],
strand = strand[1],
GENEID = paste(sort(unique(GENEID)), collapse = ","),
txNames = paste(sort(unique(TXNAME)), collapse = ","),
.groups = "drop"
)

# exonClass: "annotated" if the exon coordinates (seqnames, start, end, strand)
# exist in the reference annotation, "novel" otherwise.
annotatedCoords <- exonDf |>
filter(txClassDescription == "annotation") |>
distinct(seqnames, start, end, strand) |>
mutate(exonClass = "annotated")
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use novelTranscript as part of summarise above

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use all()


exonMeta <- exonMeta |>
left_join(annotatedCoords, by = c("seqnames", "start", "end", "strand")) |>
mutate(exonClass = ifelse(is.na(exonClass), "novel", "annotated"))

# Sparse binary matrix: unique_exons x transcripts
uniqueExons <- exonMeta$exon_id
exonNames <- paste0("EX", seq_along(uniqueExons))
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

generalise by groupID

aggregationMat <- sparseMatrix(
i = match(exonDf$exon_id, uniqueExons),
j = match(exonDf$TXNAME, txNames),
x = 1L,
dims = c(length(uniqueExons), length(txNames)),
dimnames = list(exonNames, txNames)
)

# Build output GRanges
outRanges <- GRanges(
seqnames = exonMeta$seqnames,
ranges = IRanges(start = exonMeta$start, end = exonMeta$end),
strand = exonMeta$strand
)
names(outRanges) <- exonNames
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

groupNames

mcols(outRanges)$GENEID <- exonMeta$GENEID
mcols(outRanges)$txNames <- exonMeta$txNames
mcols(outRanges)$exonClass <- exonMeta$exonClass

.aggregateAssays(se, aggregationMat, outRanges)
Copy link
Copy Markdown
Collaborator

@cying111 cying111 Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where is this function loaded from? there is a same function from library(SingleCellExperiments), but I am not sure whether it does the same thing, probably not (gemini suggested code for this function)

Assuming 'se' is your SingleCellExperiment object
and 'cluster_id' is a column in colData(se)
library(SingleCellExperiment)
aggregated_se <- .aggregateAssays(se,
grouping = colData(se)$cluster_id,
fun = "mean")

trying to find it, but not able to find it anywhere

I think maybe ok to leave it, looks very different in terms of usage

}
31 changes: 31 additions & 0 deletions R/summariseExpression_utility.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Core aggregation engine shared by summariseByExon (and any future summariseBy* functions).
# aggregationMat : sparse matrix (output_features x input_transcripts)
# outRanges : GRanges / GRangesList for the output rows
#' @importFrom SummarizedExperiment assays colData SummarizedExperiment
#' @noRd
.aggregateAssays <- function(se, aggregationMat, outRanges) {
counts <- aggregationMat %*% assays(se)$counts

counts.total <- colSums(counts)
counts.total[counts.total == 0] <- 1
cpm <- counts / counts.total * 10^6

outAssays <- SimpleList(counts = counts, CPM = cpm)

for (nm in c("fullLengthCounts", "uniqueCounts")) {
if (nm %in% names(assays(se))) {
outAssays[[nm]] <- aggregationMat %*% assays(se)[[nm]]
}
}

ColNames <- colnames(counts)
ColData <- colData(se)
ColData@rownames <- ColNames
ColData@listData$name <- ColNames

SummarizedExperiment(
assays = outAssays,
rowRanges = outRanges,
colData = ColData
)
}
Loading