-
Notifications
You must be signed in to change notification settings - Fork 0
/
report.Rmd
392 lines (350 loc) · 14.9 KB
/
report.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
---
title: "Report of Parameters used with V-SVA"
output: html_document
date: "`r Sys.time()`"
fig_width: 12
fig_height: 12
params:
dt: NA
---
##Expression Data and Sample Metadata Provided
```{r}
# dimensions of expression matrix
if (!is.null(params$dt$exp_norm)) {
dim(params$dt$exp_norm)
}
# dimensions of metadata, if not provided only Genes_Detected and Log_Total_Counts determined
if (!is.null(params$dt$meta_df)) {
dim(params$dt$meta_df)
head(params$dt$meta_df)
}
```
##Quality Control of Expression Data: Number of Features Detected in each Sample
```{r, eval=FALSE}
# binarize data to determine which features are detected in each sample
bin_data <- params$dt$exp_norm
bin_data[bin_data < 1] <- 0
bin_data[bin_data >= 1] <- 1
num.exp <- apply(bin_data,2,sum)
params$dt$detect_num <- num.exp
```
```{r}
if (!is.null(params$dt$detect_num)) {
summ <- summary(params$dt$detect_num)
# histogram of number of features detected in each sample
hist(params$dt$detect_num, col = "dodgerblue", main="",
ylab = "Samples (n)", xlab = "Number of features detected in each sample")
legend("topright", legend = paste(names(summ), round(summ, digits = 2), sep = " "), title = "Summary of features detected")
}
```
##Feature Pre-processing Options (normalization, sample and feature filtering)
#### Remove samples based on number of detected features
```{r}
# were cells filtered based on genes detected?
if (params$dt$cell_filter_choice) {
print(paste("Removing cells that have less than ", params$dt$cellfilt_number, " features detected,", sep = ""))
} else {
print("Cells were not filtered based on features detected")
}
```
```{r, eval=FALSE}
# code to filter cells based on features detected
num.sel <- params$dt$detect_num[params$dt$detect_num >= params$dt$cellfilt_number]
# subset data
params$dt$exp_norm <- params$dt$exp_norm[, names(num.sel)]
params$dt$meta_df <- params$dt$meta_df[names(num.sel), ]
```
#### Down-sample the number of samples included in the analysis (to increase computational efficiency)
```{r}
# were cells down-sampled to a certain number to speed up computation?
if (params$dt$cell_downsample_choice) {
print(paste("Cells were down-sampled to ", params$dt$cell_downsample_number, " cells", sep = ""))
} else {
print("Cells were not down-sampled")
}
```
```{r, eval=FALSE}
# code to down-sample cells
set.seed(1)
dw_samp <- base::sample(x = 1:ncol(params$dt$exp_norm), size = params$dt$cell_downsample_number)
# subset data
params$dt$exp_norm <- params$dt$exp_norm[, dw_samp]
params$dt$meta_df <- params$dt$meta_df[dw_samp, ]
```
#### Remove features not detected in samples
```{r}
# were features filtered?
if (params$dt$gene_filter_choice) {
print(paste("Features were filtered using ",
params$dt$gene_filter_method,
".", nrow(params$dt$exp_norm), " Features with ", params$dt$gene_count_num,
" or more counts in at least ", params$dt$gene_cell_num,
" cells were retained" , sep = ""))
} else {
print("Features were not filtered based on detection rate in samples")
}
```
#### Choose normalization method for expression data
```{r, eval=FALSE}
# code for gene filtering methods: all code is displayed regardless of method chosen
filter <- apply(params$dt$exp_norm, 1, function(x) length(x[x>isolate(input$Count_num)])>=isolate(input$Cell_num))
params$dt$exp_norm <- params$dt$exp_norm[filter,]
# normalize the data
# using CPM method
if (isolate(input$norm_method) == "CPM") {
params$dt$exp_norm <- edgeR::cpm(params$dt$exp_norm)
# using quantile normalization method
} else if (isolate(input$norm_method) == "Quantile") {
params$dt$exp_norm <- normalize.quantiles(params$dt$exp_norm)
# scran method
} else if (isolate(input$norm_method) == "scran") {
sce <- SingleCellExperiment(list(counts=params$dt$exp_norm))
sce <- computeSumFactors(sce)
sce <- normalize(sce)
params$dt$exp_norm <- exprs(sce)
# no normalization
} else if (isolate(input$norm_method) == "None") {
params$dt$exp_norm <- params$dt$exp_norm
}
```
##Surrogate Variable Analysis (SVA)
```{r}
# which method was used
if (!is.null(params$dt$sva_method_use)) {
print(paste("The surrogate variable analysis method chosen was: ", params$dt$sva_method_use, sep = ""))
}
# which known factors were adjusted for
if (!is.null(params$dt$known_factors_use)) {
print(paste("The following known factor(s) were adjusted for: ", params$dt$known_factors_use, sep = ""))
}
# the number of SV's
if (!is.null(params$dt$iasva.res)) {
print(paste("The number of SV's/Factors identified: ", ncol(params$dt$iasva.res$sv), sep = ""))
}
```
```{r, eval=FALSE}
# code for SVA methods
# create model matrix with known factors to adjust for
id_mod <- which(colnames(params$dt$meta_df) %in% params$dt$known_factors_use)
if (length(id_mod) > 1) {
formdf1 <- as.formula(paste("~", colnames(params$dt$meta_df)[id_mod][1], "+", paste(colnames(params$dt$meta_df)[id_mod[2:length(id_mod)]],collapse="+"), sep = ""))
mod <- model.matrix(formdf1, data = params$dt$meta_df)
} else {
varf1 <- as.factor(params$dt$meta_df[, id_mod])
mod <- model.matrix(~varf1, data = params$dt$meta_df)
}
# create summarized experiment for expression matrix to later use for marker gene identification
summ_exp <- SummarizedExperiment(assays = as.matrix(params$dt$exp_norm))
params$dt$summ_exp <- summ_exp
# if user chose IA-SVA, then perform following
if (isolate(params$dt$sva_method_use == "IA-SVA")) {
# depending on which ia-sva parameters were chosen, evaluate
if (isolate(input$iasva_param == "Percentage Threshold")) {
params$dt$iasva.res <- fast_iasva(summ_exp, mod[,-1, drop = F], verbose=FALSE,
pct.cutoff = isolate(input$pct_cutt), num.sv = NULL)
} else if (isolate(input$iasva_param == "Number of SVs")) {
params$dt$iasva.res <- fast_iasva(summ_exp, mod[,-1, drop = F], verbose=FALSE,
pct.cutoff = isolate(input$pct_cutt), num.sv = isolate(input$num_of_svs))
}
# else if choose SVA method
} else if (isolate(params$dt$sva_method_use == "SVA")) {
# perform sva analysis with specified svs
sva.res <- svaseq(params$dt$exp_norm, mod = mod, mod0 = mod[,1], n.sv = isolate(input$sva_num))
colnames(sva.res$sv) <- paste("SV", 1:ncol(sva.res$sv), sep = "")
params$dt$iasva.res <- sva.res
# else if choose zinb-wave method
} else if (isolate(params$dt$sva_method_use == "ZINB-WaVE")) {
# perform analysis with specified latent factors
zinb.matrix <- params$dt$exp_norm
# coerce to integer
mode(zinb.matrix) <- "integer"
zinb.res <- zinbFit(Y = zinb.matrix, X = mod[,-1, drop = F], K = isolate(input$zinb_num))
# extract factors
zinb.fac <- getW(zinb.res)
colnames(zinb.fac) <- paste("SV", 1:ncol(zinb.fac), sep = "")
params$dt$iasva.res <- list(sv = zinb.fac)
}
```
## Correlation Plot of SV's and Sample Metadata
```{r}
# change factors to numeric for correlation
if (!is.null(params$dt$meta_df) & !is.null(params$dt$iasva.res)) {
meta_sel <- params$dt$meta_df
for (jcol in 1:ncol(meta_sel)) {
meta_sel[,jcol] <- as.numeric(as.factor(meta_sel[,jcol]))
}
iasva_vars <- cbind(params$dt$iasva.res$sv, meta_sel)
# need to append column names to matrix
colnames(iasva_vars) <- c(paste("SV", 1:ncol(params$dt$iasva.res$sv), sep = ""),
colnames(params$dt$meta_df))
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(abs(cor(iasva_vars)), type = "upper", method = "color",
col = col(200), number.cex = 1,
addCoef.col = "black",
tl.col = "black", tl.srt = 90, diag = FALSE)
}
```
## Paired SV Plots
```{r}
if (!is.null(params$dt$iasva.res)) {
iasva.sv <- as.data.frame(params$dt$iasva.res$sv)
rownames(iasva.sv) <- colnames(params$dt$exp_norm)
pairs(iasva.sv, main="", pch=20, cex=0.5, lower.panel = NULL)
}
```
## Interactive Paired SV Plots
```{r}
if (!is.null(params$dt$iasva.res)) {
iasva.sv <- as.data.frame(params$dt$iasva.res$sv)
rownames(iasva.sv) <- colnames(params$dt$exp_norm)
plot_ly(iasva.sv, x = ~SV1, y = ~SV2, type = "scatter",
mode = "markers", text = paste("Cell ID: ", rownames(iasva.sv), sep = ""),
marker = list(
opacity = 0.5
)
)
}
```
## Identifying Marker Features associated with SV's
```{r, eval=FALSE}
# identify which svs were chosen for marker analysis
id_sv_mark <- which(colnames(params$dt$iasva.res$sv) %in% isolate(input$SV_marks))
marker_genes <- iasva::find_markers(Y = params$dt$summ_exp,
iasva.sv = as.matrix(params$dt$iasva.res$sv[, id_sv_mark, drop=FALSE]),
rsq.cutoff = isolate(input$rsqcutoff), method = isolate(input$mark_sig), sig.cutoff = isolate(input$mark_cutoff))
params$dt$markers <- marker_genes
```
## Heatmap of Marker Features Determined from SVA Analysis
```{r}
if (!is.null(params$dt$markers)) {
all_marks <- params$dt$markers$All_Unique_Markers
log_mat <- log(as.matrix(params$dt$exp_norm[all_marks,])+1)
# remove any NA's from matrix
log_mat <- log_mat[complete.cases(log_mat),]
pheatmap(log_mat, show_colnames = FALSE,
show_rownames = TRUE,
clustering_method = "ward.D2")
}
```
## Dimension Reduction
```{r, eval=FALSE}
set.seed(1)
# transpose matrix
trans_orig <- t(log(params$dt$exp_norm+1))
# remove any zeros
params$dt$pre_dim_orig <- trans_orig[, apply(trans_orig, 2, var, na.rm = TRUE) != 0]
# Principal component analysis (PCA) for all genes
dim_orig <- prcomp(x = params$dt$pre_dim_orig, center = TRUE, scale. = TRUE)
dim_orig_mat <- dim_orig$x
rownames(dim_orig_mat) <- colnames(params$dt$exp_norm)
params$dt$dim_orig <- as.data.frame(dim_orig_mat)
### Code for T-SNE ####
# T-SNE analysis (PCA) for all genes
# not comupted
# dim_orig <- Rtsne(X = params$dt$pre_dim_orig, dims = 3)
# dim_orig_mat <- dim_orig$Y
#### Code for MDS ###
# dim_orig <- cmdscale(d = dist(params$dt$pre_dim_orig), k = 3)
# dim_orig_mat <- dim_orig
# rownames(dim_orig_mat) <- colnames(params$dt$exp_norm)
# colnames(dim_orig_mat) <- c("MDS1", "MDS2", "MDS3")
# Principal component analysis for SV-selected genes
# transpose matrix
trans_mark <- t(params$dt$exp_norm[params$dt$markers_formatted[,1],])
# remove any zeros
params$dt$pre_dim_mark <- trans_mark[, apply(trans_mark, 2, var, na.rm = TRUE) != 0]
dim_mark <- prcomp(x = params$dt$pre_dim_mark, center = TRUE, scale. = TRUE)
dim_mark_mat <- dim_mark$x
rownames(dim_mark_mat) <- colnames(params$dt$exp_norm)
params$dt$dim_mark <- as.data.frame(dim_mark_mat)
```
### Dimension Reduction Visualization (using all features)
```{r}
if (!is.null(params$dt$dim_method)) {
# print your dimension reduction method
print(paste("Dimension reduction method chosen: ", params$dt$dim_method, sep = ""))
# if chosen PCA
if (params$dt$dim_method == "PCA") {
# 3D interactive dimension reduction plot using all features
plot_ly(params$dt$dim_orig, x = ~PC1, y = ~PC2, z = ~PC3, type = "scatter3d",
mode = "markers", text = paste("Cell ID: ", rownames(params$dt$dim_orig), sep = ""),
marker = list(
opacity = 0.5
)) %>% layout(title = paste("All Genes (n = ", nrow(params$dt$exp_norm), ")", sep = ""))
} else if (params$dt$dim_method == "t-SNE") {
plot_ly(params$dt$dim_orig, x = ~tSNE1, y = ~tSNE2, z = ~tSNE3, type = "scatter3d",
mode = "markers", text = paste("Cell ID: ", rownames(params$dt$dim_orig), sep = ""),
marker = list(
opacity = 0.5
)) %>% layout(title = paste("All Genes (n = ", nrow(params$dt$exp_norm), ")", sep = ""))
} else if (params$dt$dim_method == "Classical Metric MDS") {
plot_ly(params$dt$dim_orig, x = ~MDS1, y = ~MDS2, z = ~MDS3, type = "scatter3d",
mode = "markers", text = paste("Cell ID: ", rownames(params$dt$dim_orig), sep = ""),
marker = list(
opacity = 0.5
)) %>% layout(title = paste("All Genes (n = ", nrow(params$dt$exp_norm), ")", sep = ""))
}
}
```
### Dimension Reduction Visualization (using SV-associated features)
```{r}
if (!is.null(params$dt$dim_method)) {
# 3D interactive dimension reduction plot using SV selected features
if (params$dt$dim_method == "PCA") {
plot_ly(params$dt$dim_mark, x = ~PC1, y = ~PC2, z = ~PC3, type = "scatter3d",
mode = "markers", text = paste("Cell ID: ", rownames(params$dt$dim_mark), sep = ""),
marker = list(
opacity = 0.5
)) %>% layout(title = paste(params$dt$sva_method_use, " Genes (n = ", nrow(params$dt$exp_norm[params$dt$markers_formatted[,1],]),
";",params$dt$chosen_svs, ")", sep = ""))
} else if (params$dt$dim_method == "t-SNE") {
plot_ly(params$dt$dim_mark, x = ~tSNE1, y = ~tSNE2, z = ~tSNE3, type = "scatter3d",
mode = "markers", text = paste("Cell ID: ", rownames(params$dt$dim_mark), sep = ""),
marker = list(
opacity = 0.5
)) %>% layout(title = paste(params$dt$sva_method_use, " Genes (n = ", nrow(params$dt$exp_norm[params$dt$markers_formatted[,1],]), "; ",
params$dt$chosen_svs, ")", sep = ""))
} else if (params$dt$dim_method == "Classical Metric MDS") {
plot_ly(params$dt$dim_mark, x = ~MDS1, y = ~MDS2, z = ~MDS3, type = "scatter3d",
mode = "markers", text = paste("Cell ID: ", rownames(params$dt$dim_mark), sep = ""),
marker = list(
opacity = 0.5
)) %>% layout(title = paste(params$dt$sva_method_use, " Genes (n = ", nrow(params$dt$exp_norm[params$dt$markers_formatted[,1],]), "; ",
params$dt$chosen_svs, ")", sep = ""))
}
}
```
## Gene Enrichment Analysis
```{r, eval=FALSE}
# convert gene symbols to Entrez ID (for example for human data)
gene.df <- bitr(gene, fromType = "SYMBOL",
toType = c("ENSEMBL", "ENTREZID"),
OrgDb = org.Hs.eg.db)
params$dt$species <- org.Hs.eg.db
params$dt$gene.df <- gene.df
# analysis for GO biological process terms (as an example)
ego <- enrichGO(gene = params$dt$gene.df$ENTREZID,
OrgDb = params$dt$species,
keyType = "ENTREZID",
ont = "BP",
pvalueCutoff = isolate(input$pvalue_cutoff), pAdjustMethod = isolate(input$pvalue_correct),
qvalueCutoff = isolate(input$path_cutoff),
minGSSize = 5,
readable = TRUE)
params$dt$enrich_res <- ego
params$dt$category_number <- input$path_viz_num
params$dt$pathway_name <- input$Path_Type
```
```{r}
# visualize pathway results
if (!is.null(params$dt$enrich_res)) {
print(paste("Type of gene enrichment analysis performed: ", params$dt$pathway_name, sep = ""))
print(paste("Show this many categories: ", params$dt$category_number, sep = ""))
dp <- clusterProfiler::dotplot(object = params$dt$enrich_res, showCategory = params$dt$category_number) + ggtitle(params$dt$pathway_name)
plot(dp)
}
```
## Session Information
```{r}
sessionInfo()
```