-
Notifications
You must be signed in to change notification settings - Fork 1
/
extracting data for predictions.R
72 lines (61 loc) · 4.54 KB
/
extracting data for predictions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# ---- Gene-body matrix for the training samples ----
# For every gene in `top20`, average (per sample) the rows of `mmatrix_pc`
# whose IDs are listed for that gene in GENEBODYInd$SID and UTR3Ind$SID.
# Rows of the result are genes, columns are the samples in ANs followed by Ts.
top20 <- rownames(counts_BRCA_top20_train)
samples <- c(ANs, Ts)
mmatrix_BRCA_BODY_top20_train <- matrix(
  0,
  nrow = length(top20),
  ncol = length(samples),
  dimnames = list(top20, samples)
)
for (i in seq_along(top20)) {
  # `[[` looks the gene up by its exact name. The former
  # eval(parse(text = ...)) built `x$"name"` expressions, which partially
  # match names when SID is a list and break on names containing a quote.
  IDs_body <- unique(c(
    GENEBODYInd$SID[[top20[i]]],
    UTR3Ind$SID[[top20[i]]]
  ))
  # drop = FALSE keeps a one-row matrix when a single ID is found, so one
  # column-wise mean handles zero, one or many rows. A column without any
  # non-missing value yields NaN.
  temp <- mmatrix_pc[IDs_body, samples, drop = FALSE]
  mmatrix_BRCA_BODY_top20_train[i, ] <- apply(temp, 2, mean, na.rm = TRUE)
}
# ---- Promoter matrix for the training samples ----
# For every gene in `top20`, average (per sample) the rows of `mmatrix_pc`
# whose IDs are listed for that gene in TSS1500Ind$SID, TSS200Ind$SID,
# UTR5Ind$SID and EXON1Ind$SID.
mmatrix_BRCA_PROMOTER_top20_train <- matrix(
  0,
  nrow = length(top20),
  ncol = length(samples),
  dimnames = list(top20, samples)
)
for (i in seq_along(top20)) {
  # All four ID sets go inside c(). Previously a misplaced parenthesis passed
  # the EXON1 IDs as the second argument of unique() (`incomparables`), so
  # IDs found only in EXON1Ind were never included and IDs shared with
  # EXON1Ind were exempt from de-duplication.
  # `[[` is an exact-name lookup and replaces eval(parse(text = ...)).
  IDs_promoter <- unique(c(
    TSS1500Ind$SID[[top20[i]]],
    TSS200Ind$SID[[top20[i]]],
    UTR5Ind$SID[[top20[i]]],
    EXON1Ind$SID[[top20[i]]]
  ))
  # drop = FALSE keeps a one-row matrix when a single ID is found, so one
  # column-wise mean handles zero, one or many rows. A column without any
  # non-missing value yields NaN.
  temp <- mmatrix_pc[IDs_promoter, samples, drop = FALSE]
  mmatrix_BRCA_PROMOTER_top20_train[i, ] <- apply(temp, 2, mean, na.rm = TRUE)
}
# Clamp both training matrices to the closed interval [-7, 7]. Missing
# entries are excluded from every mask and are therefore left untouched.
mmatrix_BRCA_BODY_top20_train[
  !is.na(mmatrix_BRCA_BODY_top20_train) &
    mmatrix_BRCA_BODY_top20_train > 7
] <- 7
mmatrix_BRCA_BODY_top20_train[
  !is.na(mmatrix_BRCA_BODY_top20_train) &
    mmatrix_BRCA_BODY_top20_train < -7
] <- -7
mmatrix_BRCA_PROMOTER_top20_train[
  !is.na(mmatrix_BRCA_PROMOTER_top20_train) &
    mmatrix_BRCA_PROMOTER_top20_train > 7
] <- 7
mmatrix_BRCA_PROMOTER_top20_train[
  !is.na(mmatrix_BRCA_PROMOTER_top20_train) &
    mmatrix_BRCA_PROMOTER_top20_train < -7
] <- -7
# ---- Gene-body matrix for the samples to predict ----
# `samples` is re-pointed at the prediction set (ANs_toPredict, then
# Ts_toPredict); everything below this line uses that sample order.
# For every gene in `top20`, average (per sample) the rows of `mmatrix_pc`
# whose IDs are listed for that gene in GENEBODYInd$SID and UTR3Ind$SID.
samples <- c(ANs_toPredict, Ts_toPredict)
mmatrix_BRCA_BODY_top20_predict <- matrix(
  0,
  nrow = length(top20),
  ncol = length(samples),
  dimnames = list(top20, samples)
)
for (i in seq_along(top20)) {
  # `[[` looks the gene up by its exact name. The former
  # eval(parse(text = ...)) built `x$"name"` expressions, which partially
  # match names when SID is a list and break on names containing a quote.
  IDs_body <- unique(c(
    GENEBODYInd$SID[[top20[i]]],
    UTR3Ind$SID[[top20[i]]]
  ))
  # drop = FALSE keeps a one-row matrix when a single ID is found, so one
  # column-wise mean handles zero, one or many rows. A column without any
  # non-missing value yields NaN.
  temp <- mmatrix_pc[IDs_body, samples, drop = FALSE]
  mmatrix_BRCA_BODY_top20_predict[i, ] <- apply(temp, 2, mean, na.rm = TRUE)
}
# ---- Promoter matrix for the samples to predict ----
# For every gene in `top20`, average (per sample) the rows of `mmatrix_pc`
# whose IDs are listed for that gene in TSS1500Ind$SID, TSS200Ind$SID,
# UTR5Ind$SID and EXON1Ind$SID.
mmatrix_BRCA_PROMOTER_top20_predict <- matrix(
  0,
  nrow = length(top20),
  ncol = length(samples),
  dimnames = list(top20, samples)
)
for (i in seq_along(top20)) {
  # All four ID sets go inside c(). Previously a misplaced parenthesis passed
  # the EXON1 IDs as the second argument of unique() (`incomparables`), so
  # IDs found only in EXON1Ind were never included and IDs shared with
  # EXON1Ind were exempt from de-duplication.
  # `[[` is an exact-name lookup and replaces eval(parse(text = ...)).
  IDs_promoter <- unique(c(
    TSS1500Ind$SID[[top20[i]]],
    TSS200Ind$SID[[top20[i]]],
    UTR5Ind$SID[[top20[i]]],
    EXON1Ind$SID[[top20[i]]]
  ))
  # drop = FALSE keeps a one-row matrix when a single ID is found, so one
  # column-wise mean handles zero, one or many rows. A column without any
  # non-missing value yields NaN.
  temp <- mmatrix_pc[IDs_promoter, samples, drop = FALSE]
  mmatrix_BRCA_PROMOTER_top20_predict[i, ] <- apply(temp, 2, mean, na.rm = TRUE)
}
# Clamp both prediction matrices to the closed interval [-7, 7]. Missing
# entries are excluded from every mask and are therefore left untouched.
mmatrix_BRCA_BODY_top20_predict[
  !is.na(mmatrix_BRCA_BODY_top20_predict) &
    mmatrix_BRCA_BODY_top20_predict > 7
] <- 7
mmatrix_BRCA_BODY_top20_predict[
  !is.na(mmatrix_BRCA_BODY_top20_predict) &
    mmatrix_BRCA_BODY_top20_predict < -7
] <- -7
mmatrix_BRCA_PROMOTER_top20_predict[
  !is.na(mmatrix_BRCA_PROMOTER_top20_predict) &
    mmatrix_BRCA_PROMOTER_top20_predict > 7
] <- 7
mmatrix_BRCA_PROMOTER_top20_predict[
  !is.na(mmatrix_BRCA_PROMOTER_top20_predict) &
    mmatrix_BRCA_PROMOTER_top20_predict < -7
] <- -7
# Counts of the `top20` genes for the samples to predict, with every sample
# (column) divided by the entry of `factors_ls` at the same position.
# NOTE(review): the factors are picked by position, which assumes
# `factors_ls` is ordered exactly like `samples` (ANs_toPredict, then
# Ts_toPredict) -- confirm against the code that builds `factors_ls`.
# drop = FALSE keeps a matrix even for a single gene or a single sample, so
# ncol() below is always defined.
cpm_BRCA_top20_predict <- counts_BRCA_plusOne_all[top20, samples, drop = FALSE]
# seq_len() rather than 1:ncol(): with zero columns the loop body is skipped
# instead of running for i = 1 and i = 0.
for (i in seq_len(ncol(cpm_BRCA_top20_predict))) {
  cpm_BRCA_top20_predict[, i] <- cpm_BRCA_top20_predict[, i] / factors_ls[i]
}
# Data for predictions using GLMs: write the two prediction matrices, the
# scaled counts and the sample identifiers to predictData.RData in the
# current working directory.
save(
  list = c(
    "mmatrix_BRCA_PROMOTER_top20_predict",
    "mmatrix_BRCA_BODY_top20_predict",
    "cpm_BRCA_top20_predict",
    "ANs_toPredict",
    "Ts_toPredict",
    "samples"
  ),
  file = "predictData.RData"
)
# data for predictions using our PGMs
# find constituent probes
# Gather, over the first 100 entries of `top100`, every ID listed for the gene
# in the six region indexes, then keep each ID once (first occurrence wins).
# NOTE(review): the genes come from `top100`, which is not defined in this
# script, while the outputs are named *_top20 and the counts below are taken
# for `top20` -- confirm that mixing the two gene sets is intended.
IDs <- unique(unlist(
  lapply(as.character(head(top100, 100)), function(gene) {
    # `[[` is an exact-name lookup and replaces eval(parse(text = ...)).
    # EXON1Ind is now part of the collected set: a misplaced parenthesis used
    # to hand its IDs to unique() as `incomparables`, so IDs found only in
    # EXON1Ind were silently left out.
    c(
      GENEBODYInd$SID[[gene]],
      UTR3Ind$SID[[gene]],
      TSS1500Ind$SID[[gene]],
      TSS200Ind$SID[[gene]],
      UTR5Ind$SID[[gene]],
      EXON1Ind$SID[[gene]]
    )
  }),
  use.names = FALSE
))
# Rows of `mmatrix_pc` for the collected IDs, restricted to the samples to
# predict and clamped to [-7, 7]; which() skips missing entries.
mmatrix_pc_top20 <- mmatrix_pc[IDs, samples]
mmatrix_pc_top20[which(mmatrix_pc_top20 < -7)] <- -7
mmatrix_pc_top20[which(mmatrix_pc_top20 > 7)] <- 7
counts_BRCA_top20 <- counts_BRCA_plusOne_all[top20, samples]
save(
  mmatrix_pc_top20, counts_BRCA_top20, ANs_toPredict, Ts_toPredict, factors_ls,
  file = "BRCA_predictionData.RData"
)