-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_loadData.ALL.R
executable file
·118 lines (82 loc) · 3.76 KB
/
01_loadData.ALL.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#############################################################################
### Loads methylation and expression datasets and returns a
### data frame for each of these data types
#############################################################################
library(parallel)
library(data.table)
args <- commandArgs(TRUE)
dataset <- args[1]
## adapt path to dataset
main.dir <- '/icgc/dkfzlsdf/analysis/B080/steiger'
data.dir <- file.path(main.dir,'data',dataset)
out.dir <- file.path(data.dir,'output')
meth.dir <- file.path(data.dir)
file.dir <- paste('gdac.broadinstitute.org_',dataset,'.mRNAseq_Preprocess.Level_3.2015020400.0.0',sep="")
exp.dir <- file.path(data.dir,file.dir)
r.dir <- file.path(main.dir,"scripts")
dir.create(out.dir)
dir.create(paste(out.dir,"/RData",sep=""))
#source(file.path(r.dir,'00_GeneralFunctions.R'))
## A . READ METHYLATION DATA
message(paste("Reading meth-data ",dataset))
Mheader <- read.table(file.path(meth.dir,'methylation450_NArm.txt'),stringsAsFactors = F,
sep="\t",header=T,row.names=1,fill=TRUE,nrow=1,dec = ".")
M5rows <- read.table(file.path(meth.dir,'methylation450_NArm.txt'),stringsAsFactors = F,
sep="\t",header=F,row.names=1,skip=2,fill=TRUE,nrow=5,dec = ".")
classes <- sapply(M5rows,class)
M <- read.table(file.path(meth.dir,'methylation450_NArm.txt'),
sep="\t",header=F,row.names=1,fill=TRUE,skip=2,
#nrows=1000,
dec=".",comment.char = "",stringsAsFactors = F,
colClasses=classes)
message("Done")
M.id <- colnames(Mheader)
M.id <- sapply(lapply(strsplit(M.id,"\\."),'[',1:4),function(x) {y <- paste0(x,collapse="."); gsub("[A-Z]$","",y)})
colnames(M) <- M.id
## B. READ EXPRESSION DATA
message("Reading Expression-Data")
mRNAseq <- paste(dataset,'.uncv2.mRNAseq_RSEM_normalized_log2_PARADIGM.txt',sep="")
Xheader <- read.table(file.path(exp.dir,mRNAseq),stringsAsFactors = F,
sep="\t",header=T,row.names=1,fill=TRUE,nrow=1,dec = ".")
X5rows <- read.table(file.path(exp.dir,mRNAseq),stringsAsFactors = F,
sep="\t",header=F,row.names=1,skip=2,fill=TRUE,nrow=5,dec = ".")
classes <- sapply(X5rows,class)
X <- read.table(file.path(exp.dir,mRNAseq),
sep="\t",header=F,row.names=1,fill=TRUE,skip=1,
#nrows=1000,
dec=".",comment.char = "",stringsAsFactors = F,
colClasses=classes)
colnames(X) <- colnames(Xheader)
i.na <- which(apply(X,1,function(x) {sum(!is.na(x))==0}))
X <- X[-i.na,]
X.id <- colnames(X)
## build correspondance table txid <-> geneid
gid <- sapply(strsplit(rownames(X),"\\|"),'[[',2)
rownames(X) <- gid
## SELECTING COMMON SAMPLES
common <- intersect(X.id,M.id)
i.m <- match(common,M.id)
i.x <- match(common,X.id)
X.entity <- X[,i.x]
M.entity <- M[,i.m]
i.cg <- grep("cg",rownames(M.entity))
M.entity <- M.entity[i.cg,]
message('Computing median and sd ....')
STAT.entity <- data.frame(sd=apply(M.entity,1,sd,na.rm=T),
iqr=apply(M.entity,1,IQR,na.rm=T),
med=apply(M.entity,1,median,na.rm=T),
mean=apply(M.entity,1,mean,na.rm=T))
## rank CpGs by decreasing variability (ost variable on top)
STAT.entity <- STAT.entity[order(STAT.entity$iqr,decreasing=T),]
selected.cg <- rownames(STAT.entity)[1:100000]
M.entity <- M.entity[selected.cg,]
# saveRDS(X.entity,paste(out.dir,"/RData/","X.",dataset,".rds",sep=""),compress='bzip2')
# saveRDS(M.entity,paste(out.dir,"/RData/","M.",dataset,".rds",sep=""),compress='bzip2')
# saveRDS(STAT.M,paste(out.dir,"/RData/","STAT.",dataset,".rds",sep=""),compress='bzip2')
save('X.entity',
'M.entity',
'STAT.entity',
file=file.path(out.dir,paste('RData/',dataset,'.load.RData',sep="")),
compress='bzip2')
rm()
gc()