-
Notifications
You must be signed in to change notification settings - Fork 1
/
pathway-enrichment.Rmd
327 lines (229 loc) · 10.5 KB
/
pathway-enrichment.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
---
title: "Pathway Enrichment Analysis"
description: |
This script performs pathway enrichment analyses using the Gene Ontology and Reactome databases.
bibliography: astrocyte-review-bibliography.bib
csl: https://www.zotero.org/styles/elsevier-vancouver
author:
- first_name: "Ayush"
last_name: "Noori"
url: https://www.github.com/ayushnoori
affiliation: Massachusetts General Hospital
affiliation_url: https://www.serranopozolab.com
orcid_id: 0000-0003-1420-1236
output:
distill::distill_article:
toc: true
---
```{r setup, include = FALSE}
knitr::opts_chunk$set(eval = FALSE)
```
# Dependencies
Load requisite packages. Note that the `tibble` package is not loaded (to preempt namespace conflicts) but the `tibble::add_row` function is explicitly called.
This script also uses my personal utilities package `brainstorm`, which can be downloaded via `devtools::install_github("ayushnoori/brainstorm")`.
```{r load-packages, message=FALSE, warning=FALSE}
# data manipulation
library(data.table)
library(purrr)
library(magrittr)
# string operations
library(stringr)
# Excel manipulation
library(openxlsx)
# data visualization
library(ggplot2)
library(ggpubr)
# utility functions
library(brainstorm)
```
Note that directories are relative to the R project path.
```{r define-directores}
# set directories
ddir = file.path("Data", "2 - Pathway Enrichment Analysis")
dir2 = file.path("Results", "2 - Pathway Enrichment Analysis")
```
# Read Data
Read data, then input the gene symbols to <http://www.gsea-msigdb.org/gsea/msigdb/annotate.jsp>. Compute overlaps separately against the Gene Ontology (`GO:BP: GO biological process`, `GO:CC: GO cellular component`, `GO:MF: GO molecular function`) [@ashburner_gene_2000; @the_gene_ontology_consortium_gene_2019] and Reactome (`CP:REACTOME: Reactome gene sets`) [@jassal_reactome_2020] databases available from the Molecular Signatures Database [@liberzon_molecular_2011]. For each analysis, show the `top 100` gene sets with FDR *q*-value less than `1` (i.e., no *q*-value cutoff). Place output files in the `Data/2 - Pathway Enrichment Analysis` directory.
```{r read-data}
# read data
markers = fread(file.path("Data", "ADRA Protein Set.csv"))
# cat(markers[, Symbol])
```
Read MSigDB mapping file (version 7.2), which was downloaded as an XML file from <https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.2/msigdb_v7.2.xml>, parsed in Excel, and filtered for GO/Reactome pathways only.
```{r read-mapping}
# read mapping file
mapDB = fread(file.path(ddir, "MSigDB Mapping v7.3.csv")) %>%
.[, .(STANDARD_NAME, EXACT_SOURCE, MEMBERS_SYMBOLIZED, MEMBERS)]
setnames(mapDB, c("Pathway", "ID", "Symbol", "ENTREZ"))
```
# Parse Pathway Enrichment Results
Define Excel styles for workbook object.
```{r define-styles}
# header style
hs = createStyle(fontColour = "#FAF3DD", fgFill = "#337CA0",
fontName = "Arial Black", halign = "center",
valign = "center", textDecoration = "Bold",
border = "Bottom", borderStyle = "thick", fontSize = 14)
# row style #1
r1 = createStyle(fontColour = "#363635", fgFill = "#FAF3DD",
fontName = "Arial", fontSize = 10)
# row style #2
r2 = createStyle(fontColour = "#363635", fgFill = "#C4E4E9",
fontName = "Arial", fontSize = 10)
# subheader style
sh = createStyle(fontColour = "#363635", fgFill = "#FFA69E",
fontName = "Arial", textDecoration = "Bold",
border = "TopBottom", borderStyle = "thick")
```
Define function to add sheet to workbook, where `clus` is a vector of cluster labels, `sheet` is the worksheet name, and `header` is the vector of header indices.
```{r add-sheet}
add_sheet = function(datWB, clus, sheet, header, wb) {
# add worksheet
addWorksheet(wb, sheetName = sheet)
writeDataTable(wb, sheet, x = datWB, tableStyle = "TableStyleMedium15", headerStyle = hs, bandedRows = FALSE)
setColWidths(wb, sheet, cols = 1:10, widths = c("auto", 60, 8, 8, 8, 16, 16, 20, 20, 200))
freezePane(wb, sheet, firstRow = TRUE, firstCol = FALSE)
# add styling
even = clus %% 2 == 0; even[is.na(even)] = FALSE
addStyle(wb, sheet, r1, rows = which(even) + 1, cols = 1:10, gridExpand = T)
addStyle(wb, sheet, r2, rows = which(!even) + 1, cols = 1:10, gridExpand = T)
addStyle(wb, sheet, sh, rows = header + 1, cols = 1:10, gridExpand = T)
}
```
Parse, cluster, and tabulate pathway enrichment results. The similarity matrix is created by computing the Jaccard similarity coefficient, then converted to Jaccard distance by subtracting from 1.
```{r parse-pathways}
# function to compute Jaccard similarity coefficient
jaccard = function(a, b) {return(length(intersect(a,b))/length(union(a, b)))}
# read pathway enrichment results
parse_pathways = function(fname, wb) {
# read file
dat = fread(file.path(ddir, fname))
setnames(dat, c("Pathway", "K", "Description", "k", "Ratio", "p", "q"))
# merge with mapping, clean pathway names, get member list
dat = merge(dat, mapDB, by = "Pathway", all.x = TRUE) %>%
.[, Pathway := gsub("_", " ", Pathway)] %>%
.[, Pathway := sub(".*? ", "", Pathway)] %>%
.[, Members := strsplit(ENTREZ, ",")]
# create pairwise similarity matrix using Jaccard index
sim = dat[, Members] %>% outer(., ., FUN = Vectorize(jaccard))
rownames(sim) = dat[, ID]; colnames(sim) = dat[, ID]
# specify number of clusters
nclust = 15
# h = 0.95
# convert to dissimilarity matrix, then perform hierarchical clustering
pclust = as.dist(1 - sim) %>%
hclust() %>%
cutree(k = nclust) %>%
# cutree(h = 0.95) %>%
data.table(ID = names(.), Cluster = .)
# get number of clusters
# nclust = pclust[, length(unique(Cluster))]
# print(nclust)
# merge with cluster, then group by cluster and order by q-value
dat = merge(dat, pclust, by = "ID", all.x = TRUE) %>%
.[order(Cluster, q), ] %>%
.[, .(ID, Pathway, K, k, Ratio, p, q, Cluster, Symbol, ENTREZ, Description)]
# rename columns for Excel workbook
datWB = copy(dat)
setnames(datWB, c("ID", "Pathway", "K (Genes in Pathway)",
"k (Genes in Overlap)", "k/K", "p-value", "FDR q-value",
"Cluster", "Symbol", "ENTREZ", "Description"))
header = datWB[, which(!duplicated(Cluster))] + 0:(nclust - 1)
# add rows for manual annotatioN
for (z in header) {
datWB = tibble::add_row(datWB, .before = z)
datWB[z, ID := paste0("Pathway #", datWB[z + 1, "Cluster"])]
}
# get cluster and sheet name, add to workbook
clus = datWB[, Cluster]; datWB[, Cluster := NULL]
sheet = fname %>% gsub("\\s+Enrichment\\.csv$", "", .)
add_sheet(datWB, clus, sheet, header, wb)
# return original data
return(dat)
}
```
Map `pathway_results` function over each analysis (i.e., GO: BP, GO: CC, GO: MF, and Reactome).
```{r map-pathway}
# get file names
files = list.files(path = ddir, pattern = "Enrichment\\.csv$")
# create workbook object
grpWB = createWorkbook()
# map over file names
pathways = map(files, ~parse_pathways(.x, grpWB))
```
Save workbook object.
```{r save-workbook}
# define file paths
raw = file.path(dir2, "Pathway Enrichment Analysis Raw.xlsx")
annot = file.path(dir2, "Pathway Enrichment Analysis Annotated.xlsx")
# save workbooks
saveWorkbook(grpWB, raw, overwrite = TRUE)
if(!file.exists(annot)) { file.copy(raw, annot) }
```
# Aggregate Annotated Pathways
Complete pathway annotations by manually editing the file `Pathway Enrichment Analysis Annotated.xlsx`. After annotations are complete, execute the following chunks which compute cluster statistics.
```{r compute-cluster}
compute_cluster = function(sheet) {
# read data
dat = read.xlsx(annot, sheet = sheet, check.names = TRUE) %>% as.data.table()
setnames(dat, c("ID", "Pathway", "K", "k", "Ratio", "p", "q",
"Symbol", "ENTREZ", "Description"))
# get header indices
header = dat[, which(is.na(Description))]
# extract cluster labels
clus = dat[header, .(ID, Pathway)]
nclus = c(header[-1], nrow(dat) + 1) - (header + 1)
# re-create cluster labels
dat = dat[-header, ][, Cluster := rep(clus$ID, nclus)]
# compute cluster statistics
clusdat = dat[, .(Ratio = sum(k)/sum(K), logQ = mean(-log10(q))), by = Cluster]
clus = merge(clus, clusdat, by.x = "ID", by.y = "Cluster")[, Database := sheet]
return(clus)
}
```
Map `compute_cluster` function over each sheet of the annotations workbook (i.e., GO: BP, GO: CC, GO: MF, and Reactome) to compute cluster statistics.
```{r map-cluster}
# get file names
sheets = getSheetNames(annot)
# map over file names
clusters = map_dfr(sheets, compute_cluster)
# refactor cluster database labels and order by -log10(FDR q-value)
clusters = clusters %>%
.[, ID := NULL] %>%
.[, Database := factor(
Database,
levels = c("GO BP", "GO CC", "GO MF", "Reactome"),
labels = c("GO: Biological Process", "GO: Cellular Component",
"GO: Molecular Function", "Reactome"))] %>%
.[order(Database, logQ), ]
```
Function to plot pathway data in a barplot for each enrichment analysis (i.e., for each database). Note that `facet_wrap` is used for visual purposes only. Each plot created has a single facet.
```{r plot-pathways}
plot_pathways = function(datDB, DB) {
datDB[, Pathway := factor(Pathway, levels = Pathway)]
p = ggplot(datDB, aes(x = Pathway, y = logQ))+ # fill = Ratio
geom_col() +
coord_flip() +
# scale_fill_gradient(low="#FFD166", high="#A63446") +
facet_wrap(Database ~ ., scales = "free", ncol = 1) +
# labs(fill = "Gene Ratio") +
scale_y_continuous(limits = c(0, NA), expand = expansion(mult = c(0, 0.05))) +
theme_light() +
theme(plot.title = element_text(size = 16, color = "black", face = "bold", hjust = 0.5),
axis.title.x = element_blank(), axis.title.y = element_blank(),
legend.title = element_text(face = "bold"),
strip.text = element_text(size=12, color = "black", face="bold"),
strip.background = element_rect(color=NA, fill="#D9D9D9", size=1, linetype="solid"))
return(p)
}
```
Apply function over each database to create aggregated plot.
```{r plot-all}
# map over each database
cplots = imap(split(clusters, by = "Database"), plot_pathways) %>%
ggarrange(plotlist = ., ncol = 1, nrow = 4, align = "hv") %>%
annotate_figure(bottom = text_grob(bquote(bold(-log[10]*'(FDR '*bolditalic(q)*'-value)')),
size = 16, color = "black", hjust = 0))
# save figure
ggsave(file.path(dir2, "Pathway Enrichment Analysis Barplot.pdf"), cplots, height = 16, width = 12)
```