Tsunghan Hsieh 2022-08-26
This analysis is aiming for identifying aging-associated genes (biomarkers) in large intestine and visualizing it in Tableau. The data is subset from Tabula muris. The subset data is then filtered and normalized by my python script.
Due to the large size, all input files are not uploaded. All output files for Tableau are saved in output folder.
Packages are required for this analysis.
library(dplyr)
library(tidyr)
library(Matrix)
library(DESeq2)
library(msigdbr)
library(forcats)
library(fgsea)
wd <- getwd()
subset <- "subset"
output <- "output"
input_dir <- file.path(wd,subset)
output_dir <- file.path(wd,output)
method <- "facs"
tissue <- "LargeIntestine"
Functions used in this script
## get pseudobulk for each cell type
getPseudobulk <- function(mat, celltype) {
stopifnot(!missing(mat) || !missing(celltype))
mat.summary <- do.call(cbind, lapply(levels(celltype), function(ct) {
cells <- names(celltype)[celltype==ct]
pseudobulk <- rowSums(mat[, cells])
return(pseudobulk)
}))
colnames(mat.summary) <- levels(celltype)
return(mat.summary)
}
## save to csv
save_csv <- function(df,dir,name) {
stopifnot(!missing(df) || !missing(dir) || !missing(name))
write.csv(df,file.path(dir,name))
}
## calculate row-wise variance
RowVar <- function(x, ...) {
rowSums((x - rowMeans(x, ...))^2, ...)/(dim(x)[2] - 1)
}
## check the differentially-expressed genes
checkDEG <- function(df,padj,log2FC) {
stopifnot(!missing(df) || !missing(padj) || !missing(log2FC))
df <- df %>%
mutate(DEG = ifelse(padj > {{padj}}, "None", ifelse(
log2FoldChange > {{log2FC}}, "Up", ifelse(
log2FoldChange < -{{log2FC}}, "Down", "None"
)
)))
df$symbol <- rownames(df)
return(df)
}
## Get GO objects
get_GO <- function(df,pathway){
stopifnot(!missing(df))
stopifnot(!missing(pathway))
input <- df
input <- input[!is.na(input$log2FoldChange), ]
input <- input %>%
dplyr::arrange(desc(log2FoldChange))
ranks <- setNames(input$log2FoldChange , input$symbol)
fgseaRes <- fgsea(pathways = pathways, stats = ranks, minSize=15, maxSize=500)
fgseaRes$topGenes <- ""
for(k in seq(1:length(fgseaRes$leadingEdge))){
fgseaRes$topGenes[k] <- fgseaRes$leadingEdge[k][[1]] %>% paste(., collapse = ",")
}
return(fgseaRes)
}
## Get GSEA plot
get_fgsea_plot <- function(df){
stopifnot(!missing(df))
df <- df %>%
dplyr::mutate(Regulation = ifelse(NES > 0, "Up", "Down")) %>%
dplyr::mutate(log_padj_mutated = ifelse(NES > 0, -log10(padj), log10(padj)))
df_up <- df %>%
dplyr::arrange(desc(NES)) %>%
slice(1:5)
df_down <- df %>%
dplyr::arrange(NES) %>%
slice(1:5)
df_subset <- rbind(df_up,df_down)
p.fgsea <- ggplot(df_subset,aes(x=reorder(pathway,log_padj_mutated),y=log_padj_mutated,fill=factor(Regulation))) +
geom_bar(stat='identity') + theme(panel.background = element_rect(fill = "white", colour = "black"),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
plot.title = element_text(size = 7),
axis.title.y = element_blank(), axis.title.x = element_blank(),
axis.text= element_text(size = 5)) +
geom_hline(yintercept = -log10(0.05)) +
geom_hline(yintercept = log10(0.05)) +
scale_fill_manual(values=c("blue","red")) +
coord_flip() + labs(y='-log10(padj)',x='pathways')
return(p.fgsea)
}
We read data and transform it to sparse matrix, which can make the computation faster. The file reading is a long process.
annot <- read.csv(file.path(input_dir,paste(method,tissue,paste0("annot",".csv"),sep="_")))
mtx <- read.csv(file.path(input_dir,paste(method,tissue,paste0("mtx",".csv"),sep="_"))) # long process
rownames(mtx) <- mtx$index
mtx <- mtx[,2:ncol(mtx)]
mtx <- as.matrix(t(mtx))
## convert count matrix to sparse matrix for faster computing
mtx.sparse <- Matrix(mtx, sparse = TRUE)
## Check the sparse matrix class
class(mtx.sparse)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
Generate reports for cell proportion dynamics
type_cell_proportion <- "cell_proportion"
cell_sum <- annot %>%
group_by(mouse.id,cell_ontology_class) %>%
summarise(count=n()) %>%
summarise(sum=sum(count))
cell_proportion <- annot %>%
group_by(mouse.id,cell_ontology_class) %>%
summarise(count=n()) %>%
left_join(cell_sum,by="mouse.id") %>%
separate(mouse.id,c("month","id","sex"),sep="_") %>%
mutate(proportion = 100*(count/sum))
save_csv(cell_proportion,output_dir,paste(method,tissue,paste0(type_cell_proportion,".csv"),sep="_"))
Take a look at how many cell types in this tissue.
cell.type.list <- annot %>%
group_by(cell_ontology_class) %>%
summarise(count=n())
## Determine the major and minor cell population
minor_cell_populaiton <- c("Brush cell of epithelium proper of large intestine",
"secretory cell",
"enteroendocrine cell")
major_cell_population <- c("all",
"intestinal crypt stem cell",
"large intestine goblet cell",
"enterocyte of epithelium of large intestine",
"epithelial cell of large intestine")
cell.type.list
## # A tibble: 7 × 2
## cell_ontology_class count
## <chr> <int>
## 1 Brush cell of epithelium proper of large intestine 175
## 2 enterocyte of epithelium of large intestine 2542
## 3 enteroendocrine cell 161
## 4 epithelial cell of large intestine 1704
## 5 intestinal crypt stem cell 1579
## 6 large intestine goblet cell 1587
## 7 secretory cell 563
Take a look at how many mouse id.
mouseid.list <- annot %>%
distinct(mouse.id) %>%
as.list(.)
mouseid.list
## $mouse.id
## [1] "18_53_M" "18_45_M" "18_46_F" "18_47_F" "24_60_M" "24_59_M" "24_58_M"
## [8] "24_61_M" "3_8_M" "3_9_M" "3_10_M" "3_11_M" "3_56_F" "3_38_F"
## [15] "3_39_F"
To reduce the computation, we will only choose top genes with the most variance for further analysis.
## Calculate the variance, sort the gene based on the variance, and get the top variant genes
Gene.Var <- RowVar((mtx.sparse)) %>%
sort(decreasing = TRUE)
Gene.subset <- Gene.Var[1:22075]
## Subset data based on the gene index
mtx.sparse.subset <- mtx.sparse[names(Gene.subset),]
## remove minor cell populations, because in some mouse, there are only few cells from those population
mouse.list <- c("18_53_M", "18_45_M", "18_46_F",
"18_47_F", "24_60_M", "24_59_M",
"24_58_M", "24_61_M", "3_8_M",
"3_9_M", "3_10_M", "3_11_M",
"3_56_F", "3_38_F", "3_39_F" )
annot.subset <- annot %>%
filter(!cell_ontology_class %in% minor_cell_populaiton)
pseudobulk.mouse <- list()
for (imouse in mouse.list) {
test.mouse <- annot.subset %>%
filter(mouse.id == {{imouse}})
celltype <- as.factor(test.mouse$cell_ontology_class)
names(celltype) <- test.mouse$index
output <- getPseudobulk(mtx.sparse.subset, celltype)
output.df <- as.data.frame(output)
output.df$all <- rowSums(output.df)
pseudobulk.mouse[[imouse]] <- output.df
}
pseudobulk.cell <- list()
for (icell in major_cell_population) {
output <- data.frame(symbol=rownames(mtx.sparse.subset))
for (imouse in mouse.list) {
tmpdf <- data.frame(tmp = pseudobulk.mouse[[imouse]][,icell])
colnames(tmpdf) <- imouse
output <- cbind(output,tmpdf)
}
pseudobulk.cell[[icell]] <- output
}
for (icell in major_cell_population) {
save_csv(pseudobulk.cell[[icell]],output_dir,
paste(method,tissue,paste0(icell,"pseudobulk",".csv"),sep="_"))
}
Get differential genes among ages for each cell types
Get meta data from annotation files and make it as colData
object for
use in DESeq
.
meta <- annot %>%
select(mouse.id) %>%
mutate(id = mouse.id) %>%
distinct %>%
separate(id,c("month","id","gender"))
rownames(meta) <- meta$mouse.id
meta$month.factor <- factor(meta$month)
meta$gender.factor <- factor(meta$gender)
Read the count matrix by DESeqDataSetFromMatrix()
.
dds.list <- list()
for (icell in major_cell_population) {
input <- pseudobulk.cell[[icell]]
test.df <- input[,2:ncol(input)] %>%
mutate_if(is.numeric,round)
rownames(test.df) <- input$symbol
test.mtx <- as.matrix(test.df)
dds.list[[icell]] <- DESeqDataSetFromMatrix(countData = test.mtx,
colData = meta,
design = ~ month.factor)
}
Perform differential analysis by comparing 18 vs 3 months and 24 vs 3 months. Also get the count matrix (We already normalized it before, so we don’t normalize it again)
res.list <- list()
for (icell in major_cell_population) {
dds <- DESeq(dds.list[[icell]])
for (imonth in c("18","24")) {
res <- as.data.frame(results(dds,contrast = c("month.factor",imonth,"3")))
res <- checkDEG(res, 0.05, 0.5)
res.list[[icell]][[imonth]] <- res
}
}
## combine all counts
count.list <- list()
for (icell in major_cell_population) {
input <- dds.list[[icell]]
input <- estimateSizeFactors(input)
count.list[[icell]] <- as.data.frame(counts(input, normalized=TRUE))
}
df.list <- list()
for (icell in major_cell_population) {
df <- as.data.frame(t(count.list[[icell]]))
df$mouse.id <- rownames(df)
df <- df %>%
gather(symbol, count, -(mouse.id)) %>%
separate(mouse.id, c("month","id","gender")) %>%
mutate(mouse.id = paste(month,id,gender,sep="_")) %>%
left_join(res.list[[icell]][["18"]][,c("symbol","log2FoldChange","DEG")],by="symbol") %>%
left_join(res.list[[icell]][["24"]][,c("symbol","log2FoldChange","DEG")],by="symbol") %>%
mutate(DEG.both = ifelse((DEG.x != "None") & (DEG.y != "None"), TRUE, FALSE)) %>%
mutate(abs.log2FC.18 = abs(log2FoldChange.x)) %>%
mutate(abs.log2FC.24 = abs(log2FoldChange.y))
colnames(df) <- c("month","id","gender","symbol","count","mouse.id",
"log2FC.18","DEG.18","log2FC.24","DEG.24","DEG.both",
"abs.log2FC.18","abs.log2FC.24")
save_csv(df,output_dir,paste(method,tissue,icell,paste0("final_df",".csv"),sep="_"))
df.list[[icell]] <- df
}
We perform GSEA for data from differential analysis. Keep in mind we may not have significant results since the number of differential genes in each cell type are low.
Load pathways from database.
# GO_collections <- msigdbr_collections()
# split() comes from base R
c2cpKegg_gene_sets = msigdbr(species = "Mus musculus", category = "C2", subcategory = "CP:KEGG")
c2cpKegg_gene_sets_list = split(x = c2cpKegg_gene_sets$gene_symbol, f = c2cpKegg_gene_sets$gs_name)
c2cpReactome_gene_sets = msigdbr(species = "Mus musculus", category = "C2", subcategory = "CP:REACTOME")
c2cpReactome_gene_sets_list = split(x = c2cpReactome_gene_sets$gene_symbol, f = c2cpReactome_gene_sets$gs_name)
c5bp_gene_sets = msigdbr(species = "Mus musculus", category = "C5", subcategory = "GO:BP")
c5bp_gene_sets_list = split(x = c5bp_gene_sets$gene_symbol, f = c5bp_gene_sets$gs_name)
# c5mf_gene_sets = msigdbr(species = "Mus musculus", category = "C5", subcategory = "GO:MF")
# c5mf_gene_sets_list = split(x = c5mf_gene_sets$gene_symbol, f = c5mf_gene_sets$gs_name)
gene_sets_list <- c(c2cpKegg_gene_sets_list, c2cpReactome_gene_sets_list, c5bp_gene_sets_list)
# Remove duplicated genes in the list
for(i in 1:length(gene_sets_list)){
gene_sets_list[[i]] <- unique(gene_sets_list[[i]])
}
Now we perform GSEA and save the data for Tableau.
## Create gene list and order it by log2FC.24
DEG.list <- list()
for (icell in major_cell_population) {
DEG.list[[icell]] <- df.list[[icell]] %>%
select(symbol,log2FC.24) %>%
distinct(symbol,log2FC.24) %>%
mutate(log2FoldChange = log2FC.24) %>%
arrange(desc(log2FoldChange))
}
pathways <- gene_sets_list
## long process
GO.list <- list()
for (icell in major_cell_population) {
GO.list[[icell]] <- get_GO(DEG.list[[icell]],pathway)
}
for (icell in major_cell_population) {
output_df <- GO.list[[icell]][,-c("leadingEdge")] %>%
mutate(log10FDR = -log10(padj))
save_csv(output_df,output_dir, paste(method,tissue,icell,paste0("pathway",".csv"),sep="_"))
}