forked from mcrewcow/Suman_bulkRNAseq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
deseq2_to_pathways.R
137 lines (101 loc) · 4.21 KB
/
deseq2_to_pathways.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
if (!("clusterProfiler" %in% installed.packages())) {
# Install this package if it isn't installed yet
BiocManager::install("clusterProfiler", update = FALSE)
}
if (!("msigdbr" %in% installed.packages())) {
# Install this package if it isn't installed yet
BiocManager::install("msigdbr", update = FALSE)
}
if (!("org.Hs.eg.db" %in% installed.packages())) {
# Install this package if it isn't installed yet
BiocManager::install("org.Hs.eg.db", update = FALSE)
}
# Attach the library
library(clusterProfiler)
# Package that contains MSigDB gene sets in tidy format
library(msigdbr)
# Human annotation package we'll use for gene identifier conversion
library(org.Hs.eg.db)
# We will need this so we can use the pipe: %>%
library(magrittr)
wd <- "~/Documents/summer_research_2023-main/bulkrna/deseq2"
setwd(wd)
folders <- c("Control-vs-PMC",
"Control-vs-oxLDL-24h",
"Control-vs-oxLDL-48h",
"Control-vs-oxLDL-PMC-24h",
"Control-vs-oxLDL-PMC-48h")
#new column for control and test group average
for (f in folders){
csv_file <- read.csv(paste(wd, f, "Differential_expression_analysis_table.csv", sep="/"))
csv_file$ControlMean <- rowMeans(csv_file[,5:7])
csv_file$TestGroupMean <- rowMeans(csv_file[,8:10])
csv_file <- csv_file[,-5:-10]
write.csv(csv_file,file=paste(wd, f, "Differential_expression_analysis_table_new.csv", sep="/"), row.names=FALSE)
data <- csv_file
msigdbr_species()
mm_hallmark_sets <- msigdbr(
species = "Homo sapiens", # Replace with species name relevant to your data
category = "C5"
)
# First let's create a mapped data frame we can join to the differential
# expression stats
dge_mapped_df <- data.frame(
gene_symbol = mapIds(
# Replace with annotation package for the organism relevant to your data
org.Hs.eg.db,
keys = data$ID,
# Replace with the type of gene identifiers in your data
keytype = "ENSEMBL",
# Replace with the type of gene identifiers you would like to map to
column = "SYMBOL",
# This will keep only the first mapped value for each Ensembl ID
multiVals = "first"
)
) %>%
# If an Ensembl gene identifier doesn't map to a gene symbol, drop that
# from the data frame
dplyr::filter(!is.na(gene_symbol)) %>%
# Make an `Ensembl` column to store the rownames
tibble::rownames_to_column("Ensembl") %>%
# Now let's join the rest of the expression data
dplyr::inner_join(data, by = c("Ensembl" = "ID"))
dup_gene_symbols <- dge_mapped_df %>%
dplyr::filter(duplicated(gene_symbol)) %>%
dplyr::pull(gene_symbol)
dge_mapped_df %>%
dplyr::filter(gene_symbol %in% dup_gene_symbols) %>%
dplyr::arrange(gene_symbol)
filtered_dge_mapped_df <- dge_mapped_df %>%
# Sort so that the highest absolute values of the log2 fold change are at the
# top
dplyr::arrange(dplyr::desc(abs(log2FoldChange))) %>%
# Filter out the duplicated rows using `dplyr::distinct()`
dplyr::distinct(gene_symbol, .keep_all = TRUE)
print(paste("duplicated =", any(duplicated(filtered_dge_mapped_df$Gene.name))))
# Let's create a named vector ranked based on the log2 fold change values
lfc_vector <- filtered_dge_mapped_df$log2FoldChange
names(lfc_vector) <- filtered_dge_mapped_df$gene_symbol
# We need to sort the log2 fold change values in descending order here
lfc_vector <- sort(lfc_vector, decreasing = TRUE)
# Set the seed so our results are reproducible:
set.seed(2023)
gsea_results <- GSEA(
geneList = lfc_vector, # Ordered ranked gene list
minGSSize = 25, # Minimum gene set size
maxGSSize = 2000, # Maximum gene set set
pvalueCutoff = 0.01, # p-value cutoff
eps = 0, # Boundary for calculating the p value
seed = TRUE, # Set seed to make results reproducible
pAdjustMethod = "BH", # Benjamini-Hochberg correction
TERM2GENE = dplyr::select(
mm_hallmark_sets,
gs_name,
gene_symbol
)
)
# We can access the results from our `gsea_results` object using `@result`
gsea_result_df <- data.frame(gsea_results@result)
# Then write to file
write.csv(gsea_result_df,file=paste(wd, f, "GSEA_result.csv", sep="/"), row.names=FALSE)
}