-
Notifications
You must be signed in to change notification settings - Fork 15
/
synthetic_subsample.R
340 lines (308 loc) · 12 KB
/
synthetic_subsample.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
library(DESeq2)
library(edgeR)
library(parallel)
library(NOISeq)
myedger = function(count1, count2, q = 0.05){
n1 = ncol(count1)
n2 = ncol(count2)
cond_idx = rep(2, n1 + n2)
cond_idx[1:n1] = 1
dat = cbind(count1, count2)
if(!is.factor(cond_idx)){
cond_idx = factor(cond_idx)
}
y <- DGEList(counts=dat,group=cond_idx)
keep <- filterByExpr(y)
y <- y[keep,,keep.lib.sizes=FALSE]
y <- calcNormFactors(y)
count_norm = cpm(y)
design <- model.matrix(~cond_idx)
y <- estimateDisp(y,design)
fit <- glmQLFit(y,design)
qlf <- glmQLFTest(fit,coef=2)
qlf_i = topTags(qlf, n = nrow(dat), p.value = 1)@.Data[[1]]
pvalues = qlf_i[match(rownames(count1), rownames(qlf_i)),"PValue"]
qlf <- topTags(qlf, n = nrow(dat), p.value = q)
discovery = rownames(qlf)
output <- list(discovery, pvalues)
return(output)
}
mylimma = function(count1, count2, q = 0.05){
n1 = ncol(count1)
n2 = ncol(count2)
cond_idx = rep(2, n1 + n2)
cond_idx[1:n1] = 1
dat = cbind(count1, count2)
if(!is.factor(cond_idx)){
cond_idx = factor(cond_idx)
}
y <- DGEList(counts=dat,group=cond_idx)
keep <- filterByExpr(y)
y <- y[keep,,keep.lib.sizes=FALSE]
y <- calcNormFactors(y)
count_norm = cpm(y)
design <- model.matrix(~cond_idx)
v <- voom(y, design)
fit <- lmFit(v, design)
fit <- eBayes(fit)
res=topTable(fit, coef=2, sort.by="P", number=Inf, p.value = 1)
pvalues = res[match(rownames(count1), rownames(res)),"P.Value"]
discovery = rownames(res)[which(p.adjust(res$P.Value, method = "BH")< q)]
output <- list(discovery, pvalues)
return(output)
}
mydeseq2 = function(count1, count2, q=0.05){
n1 = ncol(count1)
n2 = ncol(count2)
cond_idx = rep(2, n1 + n2)
cond_idx[1:n1] = 1
dat = cbind(count1, count2)
rownames(dat) <- 1:nrow(dat)
if(!is.factor(cond_idx)){
cond_idx = factor(cond_idx)
}
dds <- DESeqDataSetFromMatrix(dat, DataFrame(cond_idx), ~ cond_idx)
dds <- DESeq(dds)
res <- results(dds, alpha = q)
pvalues <- res$padj
discovery = rownames(res)[which(res$padj <= q)]
return(list(discovery, pvalues))
}
my.wilcoxon <- function(score_exp, score_back){
p <- sapply(1:nrow(score_exp), function(j){
return(wilcox.test(as.numeric(score_exp[j,]), as.numeric(score_back[j,]), alternative = "two.sided")$p.value)
})
}
my.NOISeq <- function(readCount, conditions){
data<-NOISeq::readData(data=readCount, factors=as.data.frame(conditions))
res=noiseqbio(data, k=0.5, norm="tmm", factor="conditions",random.seed = 12345, filter = 1, cv.cutoff = 100, cpm = 1)
outputRst=degenes(res, M=NULL)
q = rep(NA, nrow(readCount))
q[as.numeric(rownames(outputRst))] = 1 - outputRst$prob
return(q)
}
my.ttest <- function(score_exp, score_back){
p <- sapply(1:nrow(score_exp), function(j){
return(t.test(as.numeric(score_exp[j,]), as.numeric(score_back[j,]), alternative = "two.sided")$p.value)
})
}
find_discovery_wpval = function(pval, q){
pval.adj = p.adjust(pval, method = 'BH')
discovery_ls = lapply(q, function(q_i){
discovery = which(pval.adj <= q_i)
})
return(discovery_ls)
}
compute_fdppow = function(discovery_ls, trueidx){
fdp = sum(!discovery_ls %in% trueidx )/max(length(discovery_ls),1)
pow = sum(discovery_ls %in% trueidx)/length(trueidx)
return(c(fdp, pow))
}
compute_fdppow2 = function(pvalues, trueidx, q){
index.p <- which(!is.na(pvalues))
is.trueDE <- 1:length(pvalues) %in% trueidx
n.dis <- cumsum(is.trueDE[order(pvalues)])
FDR.i = 1 - n.dis/1:length(pvalues)
index.i <- max(which(FDR.i<=q))
fdp <- FDR.i[index.i]
pow <- n.dis[index.i]/length(trueidx)
return(c(fdp, pow))
}
celltype.pairs <- c("AdiposeSubcutaneous.AdiposeVisceralOmentum", "BrainAmygdala.BrainSpinalcordCervicalc",
"CellsEVBtransformedLymphocytes.MinorSalivaryGland", "HeartAtrialAppendage.HeartLeftVentricle",
"WholeBlood.MuscleSkeletal", "Brain.Prostate")
tcga.pairs <- c('TCGA-BRCA', 'TCGA-KIRC', 'TCGA-LIHC', 'TCGA-LUAD', 'TCGA-PRAD', 'TCGA-THCA')
#n.sample <- c(5, 10, 15, 20, 25, 30, 35, 40)
n.sample <- c(2:10, 12, 15, 20, 25, 30, 35, 40)
for(j in 1:6){
count.mat <- read.table(paste0(tcga.pairs[j], '.normal-tumor.pair.rawCount.tsv.1'), sep = '\t', header = TRUE)
conditions <- read.table(paste0(tcga.pairs[j],'.conditions.tsv'), sep = '\t')
conditions <- substring(conditions[1,], 1, 1)
condition.names <- unique(conditions)
count1 <- count.mat[, which(conditions == condition.names[1])+1]
count2 <- count.mat[, which(conditions == condition.names[2])+1]
dataset <- cbind(count1, count2)
conditions2 <- c(rep("1", ncol(count1)),rep("2", ncol(count2)))
# y <- DGEList(dataset)
# y=calcNormFactors(y)
# cpmtmm=cpm(y)
# score_exp <- log(cpmtmm[,1:(ncol(count1))]+1)
# score_back <- log(cpmtmm[,-(1:(ncol(count1)))]+1)
# re_edger <- myedger(count1, count2, q = 0.000001)
# dis_edger <- re_edger[[1]]
# re_limma <- mylimma(count1, count2, q = 0.000001)
# dis_limma <- re_limma[[1]]
# re_deseq2 <- mydeseq2(count1, count2, q= 0.000001)
# dis_deseq2 <- re_deseq2[[1]]
# re_wilxocon <- my.wilcoxon(score_exp, score_back)
# re_wilxocon <- find_discovery_wpval(re_wilxocon, q = 0.000001)[[1]]
# re_noiseq <- my.NOISeq(dataset, conditions2)
# dis_noiseq <- which(re_noiseq <= 0.000001)
# dis0 <- list(dis_edger, dis_limma, dis_deseq2, re_wilxocon, dis_noiseq)
# saveRDS(dis0, paste0(tcga.pairs[j], '_dis.rds'))
dis0 <- readRDS(paste0(tcga.pairs[j], '_dis.rds'))
trueDE = intersect(intersect(intersect(intersect(dis0[[1]], dis0[[2]]), dis0[[3]]), dis0[[4]]), dis0[[5]])
ls_subsample <- lapply(n.sample, function(n){
print(n)
ls_onepair <- mclapply(1:25, function(it){
set.seed(it)
dataset0 <- dataset
index0 <- c(sample(1:ncol(count1), n), sample((ncol(count1)+1):ncol(dataset0), n))
dataset0 <- dataset[,index0]
trueDE0 <- sample(trueDE, length(trueDE)/2)
fdppow <- list()
fdppow.i <- list()
dataset0[-trueDE0,] <- t(apply(dataset0[-trueDE0, ], 1, sample))
#dataset0 <- t(apply(dataset0, 1, sample))
#dataset0 <- dataset0[, sample(1:ncol(dataset), ncol(dataset))]
count.perm1 <- dataset0[,1:n]
count.perm2 <- dataset0[,(n+1):(2*n)]
conditions2 <- c(rep("1", n),rep("2", n))
thres <- c(0.1, 0.05, 0.01, 0.001, 0.0001, 0.00001)
re_edger <- myedger(count.perm1, count.perm2)
dis_edger <- re_edger[[1]]
p_edger <- re_edger[[2]]
fdppow[[1]] <- sapply(1:6, function(j){
dis0 <- which(p.adjust(p_edger, method = "BH")<=thres[j])
compute_fdppow(dis0, trueDE0)
})
fdppow.i[[1]] <- sapply(1:6, function(j){
compute_fdppow2(p_edger, trueDE0, thres[j])
})
re_limma <- mylimma(count.perm1, count.perm2)
dis_limma <- re_limma[[1]]
p_limma <- re_limma[[2]]
fdppow[[2]] <- sapply(1:6, function(j){
dis0 <- which(p.adjust(p_limma, method = "BH")<=thres[j])
compute_fdppow(dis0, trueDE0)
})
fdppow.i[[2]] <- sapply(1:6, function(j){
compute_fdppow2(p_limma, trueDE0, thres[j])
})
re_deseq2 <- suppressMessages(try(mydeseq2(count.perm1, count.perm2)))
if (class(re_deseq2) == "try-error"){
fdppow[[3]] <- NA
fdppow.i[[3]] <- NA
}else{
fdppow[[3]] <-sapply(1:6, function(j){
dis0 <- which(re_deseq2[[2]] <=thres[j])
compute_fdppow(dis0, trueDE0)
})
fdppow.i[[3]] <- sapply(1:6, function(j){
compute_fdppow2(re_deseq2[[2]], trueDE0, thres[j])
})
}
re_wilxocon <- my.wilcoxon(count.perm1, count.perm2)
fdppow[[4]] <- sapply(1:6, function(j){
dis0 <- which(p.adjust(re_wilxocon, method = "BH")<=thres[j])
compute_fdppow(dis0, trueDE0)
})
fdppow.i[[4]] <- sapply(1:6, function(j){
compute_fdppow2(re_wilxocon, trueDE0, thres[j])
})
re_noiseq <- suppressMessages(my.NOISeq(dataset0, conditions2))
fdppow[[5]] <- sapply(1:6, function(j){
dis0 <- which(re_noiseq<=thres[j])
compute_fdppow(dis0, trueDE0)
})
fdppow.i[[5]] <- sapply(1:6, function(j){
compute_fdppow2(re_noiseq, trueDE0, thres[j])
})
return(list(fdppow, fdppow.i))
}, mc.cores = 25)
return(ls_onepair)
})
saveRDS(ls_subsample, paste0(tcga.pairs[j], "subsample.rds"))
}
n.sample <- c(2:10, 12, 15, 20, 40, 60, 80, 100)
for(j in 1:6){
if (j <=5){
count.mat <- read.table(paste0(celltype.pairs[j], '.gene_readCount.tsv'), sep = '\t', header = TRUE)
conditions <- read.table(paste0(celltype.pairs[j],'.conditions.tsv'), sep = '\t')
conditions <- substring(conditions[1,], 1, 1)
condition.names <- unique(conditions)
count1 <- count.mat[, which(conditions == condition.names[1])+1]
count2 <- count.mat[, which(conditions == condition.names[2])+1]
}else{
Prostate <- read.table('Prostate.wtGeno.gene_readCount.tsv', sep = '\t', header = TRUE)
Brain <- read.table('Brain-Cortex.wtGeno.gene_readCount.tsv', sep = '\t', header = TRUE)
count1 <- Prostate[,-c(1:2)]
count2 <- Brain[,-c(1:2)]
conditions <- rep(2, ncol(count1) + ncol(count2))
conditions[1:ncol(count1)] = 1
conditions = factor(conditions)
}
dataset <- cbind(count1, count2)
dis0 <- readRDS(paste0(celltype.pairs[j], '_dis.rds'))
trueDE = intersect(intersect(intersect(intersect(dis0[[1]], dis0[[2]]), dis0[[3]]), dis0[[4]]), dis0[[5]])
ls_subsample <- lapply(n.sample, function(n){
ls_onepair <- mclapply(1:25, function(it){
set.seed(it)
dataset0 <- dataset
index0 <- c(sample(1:ncol(count1), n), sample((ncol(count1)+1):ncol(dataset0), n))
dataset0 <- dataset[,index0]
trueDE0 <- sample(trueDE, length(trueDE)/2)
fdppow <- list()
fdppow.i <- list()
dataset0[-trueDE0,] <- t(apply(dataset0[-trueDE0, ], 1, sample))
#dataset0 <- t(apply(dataset0, 1, sample))
#dataset0 <- dataset0[, sample(1:ncol(dataset), ncol(dataset))]
count.perm1 <- dataset0[,1:n]
count.perm2 <- dataset0[,(n+1):(2*n)]
thres <- c(0.1, 0.05, 0.01, 0.001, 0.0001, 0.00001)
conditions2 <- c(rep("1", n),rep("2", n))
re_edger <- myedger(count.perm1, count.perm2)
dis_edger <- re_edger[[1]]
p_edger <- re_edger[[2]]
fdppow[[1]] <- sapply(1:6, function(j){
dis0 <- which(p.adjust(p_edger, method = "BH")<=thres[j])
compute_fdppow(dis0, trueDE0)
})
fdppow.i[[1]] <- sapply(1:6, function(j){
compute_fdppow2(p_edger, trueDE0, thres[j])
})
re_limma <- mylimma(count.perm1, count.perm2)
dis_limma <- re_limma[[1]]
p_limma <- re_limma[[2]]
fdppow[[2]] <- sapply(1:6, function(j){
dis0 <- which(p.adjust(p_limma, method = "BH")<=thres[j])
compute_fdppow(dis0, trueDE0)
})
fdppow.i[[2]] <- sapply(1:6, function(j){
compute_fdppow2(p_limma, trueDE0, thres[j])
})
re_deseq2 <- try(mydeseq2(count.perm1, count.perm2))
if (class(re_deseq2) == "try-error"){
fdppow[[2]] <- NA
fdppow.i[[2]] <- NA
}else{
fdppow[[3]] <-sapply(1:6, function(j){
dis0 <- which(re_deseq2[[2]] <=thres[j])
compute_fdppow(dis0, trueDE0)
})
fdppow.i[[3]] <- sapply(1:6, function(j){
compute_fdppow2(re_deseq2[[2]], trueDE0, thres[j])
})
}
re_wilxocon <- my.wilcoxon(count.perm1, count.perm2)
fdppow[[4]] <- sapply(1:6, function(j){
dis0 <- which(p.adjust(re_wilxocon, method = "BH")<=thres[j])
compute_fdppow(dis0, trueDE0)
})
fdppow.i[[4]] <- sapply(1:6, function(j){
compute_fdppow2(re_wilxocon, trueDE0, thres[j])
})
re_noiseq <- my.NOISeq(dataset0, conditions2)
fdppow[[5]] <- sapply(1:6, function(j){
dis0 <- which(re_noiseq<=thres[j])
compute_fdppow(dis0, trueDE0)
})
fdppow.i[[5]] <- sapply(1:6, function(j){
compute_fdppow2(re_noiseq, trueDE0, thres[j])
})
return(list(fdppow, fdppow.i))
}, mc.cores = 25)
return(ls_onepair)
})
saveRDS(ls_subsample, paste0(celltype.pairs[j], "subsample.rds"))
}