Skip to content

Commit

Permalink
adds MultiAssayExperiment data generation
Browse files Browse the repository at this point in the history
  • Loading branch information
averissimo committed Jun 22, 2018
1 parent f08ed30 commit 8756ec1
Show file tree
Hide file tree
Showing 12 changed files with 237 additions and 466 deletions.
2 changes: 1 addition & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ vignettes/MANIFEST\.txt
vignettes/Human_genes__GRCh38_p10_\.rda
backup/
GDCdata
^.*\.csv
^.*\.csv
7 changes: 5 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: skcm.data
Title: Gene expression and clinical data from Melanoma from TCGA.
Version: 2017.11.20
Version: 2018.06.20
Authors@R: person("André", "Veríssimo", email = "[email protected]", role = c("aut", "cre"))
Description: Contains the datasets for SKCM (Melanoma) with gene expression
and clinical data. All was extracted from TCGA.
Expand All @@ -13,6 +13,9 @@ Suggests:
futile.logger,
TCGAbiolinks,
knitr,
rmarkdown
rmarkdown,
Biobase,
SingleCellExperiment,
MultiAssayExperiment
RoxygenNote: 6.0.1
VignetteBuilder: knitr
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(build.assay)
export(getParticipantCode)
export(getSampleTypeCode)
export(joinRNASeqData)
Expand Down
18 changes: 3 additions & 15 deletions R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,14 @@ getSampleTypeCode <- function(fullBarcode) {
#'
#' @return a matrix with gene expression levels and all tissue samples
#' @export
#'
#' @examples
joinRNASeqData <- function() {
# load tissue data
data(fpkm.per.tissue)

# iterate on all and join in a single matrix
arrays.in.dat <- sapply(seq(length(tissue)), function(ix){ is.array(tissue[[ix]]) })

out.dat <- c()
for ( ix in which(arrays.in.dat)) {
out.dat <- cbind(out.dat, tissue[[ix]])
for (ix in names(fpkm.per.tissue)) {
out.dat <- cbind(out.dat, fpkm.per.tissue[[ix]])
}
return(out.dat)
}
Expand All @@ -63,20 +59,12 @@ joinRNASeqData <- function() {
#' This cannot be cached in the package as it takes too much space and
#' would be redundant with fpkm.per.tissue data
#'
#' @param project
#' @param workflow.type
#'
#' @return
#' @return a full gdc
#' @export
#'
#' @examples
loadGDCRnaSeq <- function() {
data(gdc)

temp.dat <- joinRNASeqData()
rownames(temp.dat) <- rownames(gdc$rnaseq)
temp.dat <- temp.dat[,colnames(gdc$rnaseq)]

gdc$rnaseq@assays[[1]] <- temp.dat

return(gdc)
Expand Down
84 changes: 84 additions & 0 deletions R/load_assay.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#' Create MultiAssayExperiment object from data
#'
#' Combines clinical data, RNASeq expression data and mutation counts into a
#' single MultiAssayExperiment object. Every argument that is left as
#' \code{NULL} falls back to the corresponding dataset loaded with
#' \code{data()}.
#'
#' @param clinical.custom use custom clinical (that can be pre-processed), it
#'   must have a \code{bcr_patient_barcode} column. Defaults to
#'   \code{clinical$all}
#' @param gdc.custom use custom gdc data, its \code{bio.sample} element is used
#'   to annotate the expression samples. Defaults to the \code{gdc} dataset
#' @param mutation.custom use custom mutation count matrix, column names are
#'   matched against \code{bcr_patient_barcode}. Defaults to
#'   \code{mutation$count}
#' @param rnaseq.custom use custom expression matrix, the first 12 characters
#'   of each column name are matched against \code{bcr_patient_barcode}.
#'   Defaults to the output of \code{joinRNASeqData()}
#'
#' @return a MultiAssayExperiment object
#' @export
#'
#' @examples
#' assay <- build.assay()
#' assay[['RNASeq']]
#' assay$vital_status
build.assay <- function(clinical.custom = NULL,
                        gdc.custom      = NULL,
                        mutation.custom = NULL,
                        rnaseq.custom   = NULL) {
  # get clinical data
  if (is.null(clinical.custom)) {
    data(clinical)
    clin <- clinical$all
  } else {
    clin <- clinical.custom
  }

  futile.logger::flog.info('Loading \'Biospecimen\' data...')
  if (is.null(gdc.custom)) {
    data(gdc)
    gdc.custom <- gdc
  }

  # get all RNASeq data
  futile.logger::flog.info('Joining \'RNASeq\' data...')
  if (is.null(rnaseq.custom)) {
    rnaseq.custom <- joinRNASeqData()
  }

  futile.logger::flog.info('Loading \'Mutation\' data...')
  if (is.null(mutation.custom)) {
    data(mutation)
    mutation.custom <- mutation$count
  }

  #
  # Expression data

  # first 12 characters of the column name are compared with the patient
  #  barcode, the first 16 with the sample barcode
  rnaseq.colnames <- colnames(rnaseq.custom)
  patient.barcode <- strtrim(rnaseq.colnames, 12)
  sample.barcode  <- strtrim(rnaseq.colnames, 16)

  # filter only valid data.. i.e. expression that has clinical data and
  #  biospecimen data. Always use gdc.custom (never the global gdc), otherwise
  #  a user supplied object would be ignored
  valid.ix <- patient.barcode %in% clin$bcr_patient_barcode &
    sample.barcode %in% gdc.custom$bio.sample$bcr_sample_barcode

  # drop = FALSE keeps a matrix even if a single sample is left
  valid.dat   <- rnaseq.custom[, valid.ix, drop = FALSE]
  valid.codes <- sample.barcode[valid.ix]

  # map expression data with clinical, restricted to the columns that are
  #  kept so that the map and the expression set always agree
  es.map <- data.frame(master = patient.barcode[valid.ix],
                       assay  = rnaseq.colnames[valid.ix],
                       stringsAsFactors = FALSE)

  # NOTE(review): assumes rows of bio.sample are named by sample barcode
  #  (rows are selected by name) -- TODO confirm
  temp.df <- Biobase::AnnotatedDataFrame(gdc.custom$bio.sample[valid.codes, ])
  rownames(temp.df) <- colnames(valid.dat)

  # build expression set
  es <- Biobase::ExpressionSet(assayData = valid.dat, phenoData = temp.df)

  #
  # Mutation data
  mutation.colnames <- colnames(mutation.custom)
  valid.ix <- mutation.colnames %in% clin$bcr_patient_barcode

  mut.map <- data.frame(master = mutation.colnames[valid.ix],
                        assay  = mutation.colnames[valid.ix],
                        stringsAsFactors = FALSE)

  # keep only the columns that are described in the map
  valid.mutation <- mutation.custom[, valid.ix, drop = FALSE]
  mut <- SingleCellExperiment::SingleCellExperiment(
    assays = list(counts = valid.mutation)
  )

  #
  # Setup to create MultiAssayExperiment object

  futile.logger::flog.info('Building Assay...')
  listmap <- list(es.map, mut.map)
  names(listmap) <- c("RNASeq", "Mutation")

  dfmap    <- MultiAssayExperiment::listToMap(listmap)
  objlist  <- list("RNASeq" = es, "Mutation" = mut)
  my.assay <- MultiAssayExperiment::MultiAssayExperiment(objlist, clin, dfmap)

  return(my.assay)
}
474 changes: 31 additions & 443 deletions README.html

Large diffs are not rendered by default.

70 changes: 70 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
TCGA.DATA R Package
================

- [Package information](#package-information)
- [How to use the dataset](#how-to-use-the-dataset)
- [How to build own data package](#how-to-build-own-data-package)
- [Ackowledgements](#ackowledgements)

This R package retrieves gene expression, mutation and clinical data from the [TCGA database](http://gdc-portal.nci.nih.gov/) (The Cancer Genome Atlas). It retrieves a single type of cancer at a time.

We publish different packages on the [releases page](https://github.com/averissimo/tcga.data/releases) so that the datasets can be used right away.

The gene expression datasets are already in a matrix format, ready to be used. The data is in FPKM (Fragments Per Kilobase Million) format. Any additional normalization required by a model must be performed by the user.

Package information
-------------------

### How to use the dataset

1. Install one of the data packages (`brca.data`, `prad.data` or `skcm.data`) using the `devtools` package.

2. Load the library

3. Load the required datasets (one or more of the following)
- `clinical`
- `fpkm.per.tissue`
- `fpkm.per.tissue.barcode`
- `mutation`
- `gdc`

#### Example for BRCA package

``` r
# install the devtools library
install.packages('devtools')
# Alternatively, load devtools with library(devtools) and call install_url() without the 'devtools::' prefix
devtools::install_url('https://github.com/averissimo/tcga.data/releases/download/2016.12.15-brca/brca.data_1.0.tar.gz')
#
# Load the brca.data package
library(brca.data)
# start using the data, for example the tissue data
data(fpkm.per.tissue)
# fpkm.per.tissue is now in the environment and will be loaded the first
# time it is used. For example:
names(fpkm.per.tissue)
```

How to build own data package
-----------------------------

1. Open vignettes/build\_data.Rmd
2. Change in the header of the Rmd *(beginning of the document)* the project param to the target TCGA project
3. Open DESCRIPTION and change the name of the package to the desired name

- we use a convention of \#\#\#\#.data where \#\#\#\# is the tcga project name in lowercase

1. Run the vignettes/build\_data.Rmd to build the cache of the data
2. Run `devtools::document()` to create documentation
3. Run `devtools::build()` to build the actual package

Ackowledgements
---------------

This package was developed primarily by *[André Veríssimo](http://web.tecnico.ulisboa.pt/andre.verissimo/)* with support from *Marta Lopes* and *[Susana Vinga](http://web.tecnico.ulisboa.pt/susanavinga/)*

This work was supported by:

- [FCT](https://www.fct.pt), through IDMEC, under LAETA, projects *(UID/EMS/50022/2013)*;
- Susana Vinga acknowledges support by program Investigador FCT *(IF/00653/2012)* from [FCT](https://www.fct.pt), co-funded by the European Social Fund *(ESF)* through the Operational Program Human Potential *(POPH)*;
- André Veríssimo acknowledges support from [FCT](https://www.fct.pt) *(SFRH/BD/97415/2013)*.
23 changes: 23 additions & 0 deletions man/build.assay.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/gdc.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/gene.ranges.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/loadGDCRnaSeq.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 15 additions & 1 deletion vignettes/build_data.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -376,14 +376,27 @@ ggplot(data = mutations.by.case) +
scale_y_continuous(breaks = c(1, 10, 100, 1000, 10000), trans = 'log10')
```

## MultiAssayExperiment

Builds a MultiAssayExperiment object with clinical, RNASeq and mutation count data.

```{r, eval=FALSE}
source('../R/load_assay.R')
multiAssay <- build.assay(clinical.custom = clinical$all,
gdc.custom = gdc,
mutation.custom = mutation$count,
rnaseq.custom = fpkm.per.tissue$all)
```


## Exported data

- `fpkm.per.tissue`: gene expression data for all types of tissues, see `names(fpkm.per.tissue)`;
- `fpkm.per.tissue.barcode`: Patient's participation data code (TCGA-XX-XXXX) per type of tissue;
- `clinical`: Clinical data per tissue type. Has the same structure as tissue.
- `mutation`: Mutation data from GDC (filtered) and a matrix of counts.

```{r export_data, include=FALSE}
```{r export_data, include=FALSE, eval=FALSE}
# Extract all expression data to a different variable to remove redundancy in `tissue`
fpkm.per.tissue$all <- NULL
gdc$rnaseq <- 'Use function loadGDCRnaSeq to get the original assay'
Expand All @@ -397,6 +410,7 @@ devtools::use_data(fpkm.per.tissue.barcode, overwrite = TRUE)
devtools::use_data(clinical, overwrite = TRUE)
devtools::use_data(gene.ranges, overwrite = TRUE)
devtools::use_data(mutation, overwrite = TRUE)
#devtools::use_data(multiAssay, overwrite = TRUE)
```

```{r, include=FALSE, eval=FALSE}
Expand Down

0 comments on commit 8756ec1

Please sign in to comment.