documentation update.

mahmoudibrahim · Jun 24, 2019 · 8b05403 · 8b05403
1 parent 0356d88
commit 8b05403
Show file tree

Hide file tree

Showing 22 changed files with 272 additions and 244 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,12 +1,12 @@
 Package: genesorteR
 Type: Package
 Title: Feature Ranking in Clustered Single Cell Data
-Version: 0.1.5
+Version: 0.2.0
 Author: Mahmoud M Ibrahim
 Maintainer: Mahmoud M Ibrahim <[email protected]>
-Description: The main purpose of this R extension is to select features in (possibly very large) single cell data including scRNA-Seq and scATAC-Seq.
+Description: The main purpose of this R extension is to select features in (possibly very large) single cell data including scRNA-Seq and potentially scATAC-Seq.
     The main idea is that the dropout rate of a gene is a good measure of its expression, and that empirical statistics calculated based on binarized expression matrices are sufficient to select marker genes in a way that is consistent with the expected definition of "marker gene" in experimental biology research.
-    It can also provide a ranking of genes (or in fact whatever features present in the data) specificity to each cell cluster, as well as select large or small sets of marker genes by a permutation test or using entropy-based feature selection.
+    It can provide a ranking of genes specificity in each cell cluster, as well as select large or small sets of marker genes by a permutation test or using entropy-based feature selection.
     To assess cell clustering quality, some functions can also compute cell cluster quality metrics.
 License: GPL-3 + file LICENSE
 Encoding: UTF-8

diff --git a/R/getClassAUC.R b/R/getClassAUC.R
@@ -11,7 +11,7 @@
 #' relatively small number of genes that have a very high specificity score. Top
 #' marker genes for a  cluster that is poorly separated from other cell
 #' clusters will have average or low specificity scores. Sorting the genes for
-#' each cell cluster by their specificity score and plotting the scaled scores
+#' each cell cluster by their specificity scores and plotting the scaled scores
 #' in order creates a curve that should be far from the diagonal for
 #' well-separated clusters but close to the diagonal for poorly-separated
 #' clusters. The AUC of this curve can be used to quantify this intuition and
@@ -32,28 +32,30 @@
 #' @export
 #' @author Mahmoud M Ibrahim <[email protected]>
 #' @examples
-#' #randomly generated cell clusters
+#' #randomly generated expression matrix and cell clusters
 #' set.seed(1234)
 #' exp = matrix(sample(0:20,1000,replace=TRUE), ncol = 20)
 #' rownames(exp) = sapply(1:50, function(x) paste0("g", x))
 #' cellType = sample(c("cell type 1","cell type 2"),20,replace=TRUE)
 #' sg = sortGenes(exp, cellType)
 #' classAUC = getClassAUC(sg)
-#' mean(classAUC) #overall clustering quality
 #'
-#' #"reasonably" separated clusters, with a few clear markers
+#' #"reasonably" separated clusters
 #' data(sim)
 #' sg = sortGenes(sim$exp, sim$cellType)
 #' classAUC = getClassAUC(sg)
-#' mean(classAUC)
 #'
 #' #real data with three well separated clusters
 #' data(kidneyTabulaMuris)
 #' sg = sortGenes(kidneyTabulaMuris$exp, kidneyTabulaMuris$cellType)
 #' classAUC = getClassAUC(sg)
-#' mean(classAUC)
-getClassAUC = function(gs, markers = NULL, plotCurves = TRUE, colors = c("#d64e3c7F","#7dd5547F","#8641c67F","#cfca457F","#7778cb7F","#59803d7F","#d04d9c7F","#73d6a87F","#492f607F","#ccc4977F","#7f343b7F","#72acc07F","#b97d407F","#c796b57F","#45483a7F", "#A020F07F","#00FF007F","#FFFF007F")) {
+getClassAUC = function(gs, markers = NULL, plotCurves = TRUE, colors = NULL) {
 
+
+	if (is.null(colors)) {
+		colors = c("#d64e3c7F","#7dd5547F","#8641c67F","#cfca457F","#7778cb7F","#59803d7F","#d04d9c7F","#73d6a87F","#492f607F","#ccc4977F","#7f343b7F","#72acc07F","#b97d407F","#c796b57F","#45483a7F", "#A020F07F","#00FF007F","#FFFF007F")
+	}
+
 	auc = rep(0, length = ncol(gs$specScore))
 	if (is.null(markers)) {
 		for (i in 1:ncol(gs$specScore)) {

diff --git a/R/getMarkers.R b/R/getMarkers.R
@@ -4,41 +4,40 @@
 #' small set of marker genes. 
 #'
 #'
-#' getMarkers relies on calculating an entropy-like metric (called here Shannon
-#' Index) calculated on the scaled gene-cluster specificity score. The intuition
-#' is that genes with low entropy are either lowly expressed or highly specific
-#' to one or few cell clusters. Therefore, we select the top n\% of genes
-#' according to the scaled specificity score and then cluster those genes based
-#' on their Shannon Index. \code{n} is controlled by \code{quant}, the default
-#' value 0.99 means top 1% of genes. Using lower values up to 0.95 may be ok but 
-#' lower values than 0.95 are not recommended nor necessary. It can be increased
-#' to 0.999 for example to obtain a smaller set of genes. Scaling the specificity 
-#' score for each cluster is done to try to guarantee that markers for each 
-#' cluster will eventually be selected even if the cluster is not absolutely
-#' well separated.
+#' getMarkers relies on calculating an entropy-like metric (called here the Gene 
+#' Specificity Shannon Index) calculated on the scaled gene-cluster specificity 
+#' score. The intuition is that genes with low Shannon Index are either lowly 
+#' expressed or highly specific to one or few cell clusters. Therefore, we select
+#' the top n\% of genes according to the scaled specificity score and then cluster 
+#' those genes based on their Shannon Index into genes with high Shannon Index 
+#' and genes with low Shannon Index. \code{n} is controlled by \code{quant}, 
+#' the default value 0.99 means top 1 \code{%} of genes. Using lower values up to 
+#' 0.95 may be ok but lower values than 0.95 are not recommended nor necessary. It 
+#' can be increased to 0.999 for example to obtain a smaller set of genes, especially
+#' for data with many cell clusters. Scaling the specificity score for each cluster
+#' is done to try to guarantee that markers for each cluster will eventually be 
+#' selected even if the cluster is not absolutely well-separated.
 #'
 #' It can also return the mutual information between each gene and cell clusters, 
 #' as well as a Cluster Shannon Index, indicating how well separated cell clusters are 
-#' from each other.
-#' Cluster Shannon Index is calculated on the scaled specificity score of selected 
-#' marker genes for each cluster. The intuition is that, when restricted to top marker genes, 
-#' well-defined clusters will have few high scoring genes (lower Shannon Index).
+#' from each other. Cluster Shannon Index is calculated on the scaled specificity score 
+#' of selected marker genes for each cluster. The intuition is that, when restricted to
+#' top marker genes, well-defined clusters will have few high scoring genes 
+#' (low Cluster Shannon Index).
 #'
-#' @param gs A list containing \code{$specScore} sparse matrix, \code{$binary}
-#'   expression sparse matrix and \code{$inputClass} vector. Typically the output of
-#'   \code{sortGenes()}.
-#' @param quant A number greater than zero and smaller than one. 0.95 by
+#' @param gs The output of \code{sortGenes()}.
+#' @param quant A number greater than zero and smaller than one. 0.99 by
 #'   default. See Details.
 #' @param mutualInfo Logical. If \code{TRUE}, the mutual information between
-#'   gene expression and cell class will be returned for each gene. \code{FALSE}
+#'   gene expression and cell clustering will be returned for each gene. \code{FALSE}
 #'   by default.
 #' @param classEnt Logical. If \code{TRUE}, a "Cluster Shannon Index" is
 #'   returned for each cluster. See Details. \code{FALSE} by default.
 #' @return \code{getMarkers} returns a list with the following components:
 #'   \item{markers}{A character vector containing the names of selected marker
 #'   features.} \item{maxScaledSpecScore}{A numeric vector of length
-#'   \code{nrow(specScore)}, including the maximum scaled specificity score for
-#'   each gene across all cell clusters.} \item{gene_entropy}{A numeric vector
+#'   \code{nrow(gs$specScore)}, including the maximum scaled specificity score for
+#'   each gene across all cell clusters.} \item{gene_shannon_index}{A numeric vector
 #'   of the same length as \code{maxScaledSpecScore}, including the Gene Shannon
 #'   Index for each gene. See Details.} \item{mutInfo}{A numeric vector of the
 #'   same length as \code{maxScaledSpecScore}, including the mutual information
@@ -49,7 +48,7 @@
 #' @author Mahmoud M Ibrahim <[email protected]>
 #' @export
 #' @examples
-#' #randomly generated cell clusters, almost no markers are found
+#' #randomly generated data and cell clusters, almost no markers are found
 #' set.seed(1234)
 #' exp = matrix(sample(0:20,1000,replace=TRUE), ncol = 20)
 #' rownames(exp) = sapply(1:50, function(x) paste0("g", x))
@@ -61,18 +60,20 @@
 #'
 #' #"reasonably" separated clusters, with a few clear markers
 #' data(sim)
-#' sg = sortGenes(sim$exp, sim$cellType)
-#' mm = getMarkers(sg,quant=0.95)
+#' gs = sortGenes(sim$exp, sim$cellType)
+#' mm = getMarkers(gs,quant=0.95)
 #' length(mm$markers)
 #'
 #' #real data with three well separated clusters
 #' data(kidneyTabulaMuris)
-#' sg = sortGenes(kidneyTabulaMuris$exp, kidneyTabulaMuris$cellType)
-#' mm = getMarkers(sg)
-#' length(mm$markers) #we found 450 candidate markers
+#' gs = sortGenes(kidneyTabulaMuris$exp, kidneyTabulaMuris$cellType)
+#' mm = getMarkers(gs, quant = 0.99)
+#' length(mm$markers) #we found 109 candidate markers
 #' #we want to get a more focused list:
-#' mm = getMarkers(sg, quant = 0.999)
+#' mm = getMarkers(gs, quant = 0.999)
 #' length(mm$markers) #11 genes that can alone descriminate between the cell types
+#' @seealso
+#' getPValues
 getMarkers = function(gs, quant = 0.99, mutualInfo = FALSE, classEnt = FALSE) {
 
   scored = apply(gs$specScore, 2, score)
@@ -93,6 +94,6 @@ getMarkers = function(gs, quant = 0.99, mutualInfo = FALSE, classEnt = FALSE) {
     names(classEntropy) = colnames(gs$specScore)
   }
 
-  return(list(gene_entropy = ent, maxScaledSpecScore = maxi, markers = markers, mutInfo = mutInfo, classEntropy = classEntropy))
+  return(list(gene_shannon_index = ent, maxScaledSpecScore = maxi, markers = markers, mutInfo = mutInfo, classEntropy = classEntropy))
 
 }
diff --git a/R/getPValues.R b/R/getPValues.R
@@ -4,14 +4,14 @@
 #' score to obtain a p-value value on each gene-cell type specificity score. The
 #' permutation keeps everything the same except that cell type assignments are
 #' permuted between the cells (but note that cell type proportions are also kept
-#' the same). This function can be a way to select all differentially expressed
-#' genes between all cell classes globally in a dataset in one go, but it can
-#' also perform "differential expression" locally between two cell types.
+#' the same). This function is a way to select all differentially expressed
+#' genes between all cell classes globally in a dataset in one go or to define 
+#' differentially expressed genes in a specific cell cluster.
 #'
-#' @param gs A list, typically the output of \code{sortGenes()}.
+#' @param gs The output of \code{sortGenes()}.
 #' @param numPerm The number of permutations to do. The default value 5 is to
-#'   ensure reasonable running time for large datasets but might be advisable to
-#'   increase it in many cases.
+#'   ensure quick running time for very large datasets but can be increased to
+#'   increase the power of the permutation test, see Details.
 #' @param correctMethod The method used to correct p-values for multiple
 #'   hypothesis testing. Any valid input to "method" in \code{p.adjust} is
 #'   allowed. p-value correction is done on a gene-by-gene basis.
@@ -22,7 +22,7 @@
 #'   one cell from each of the cell clusters contained in \code{$specScore}.
 #'   Note this option is still experimental and might not give consistent
 #'   results.
-#' @param cores A number greater than zero (1 by default) that indicates how
+#' @param cores An integer greater than zero (1 by default) that indicates how
 #'   many cores to use for parallelization using mclapply.
 #' @param seed The seed for random permutations.
 #'
@@ -34,18 +34,20 @@
 #'   that indicates the starting column for each performed permutation in
 #'   \code{permuteVal}} \item{pval}{A matrix with as many rows as genes and
 #'   columns as \code{ncol($specScore)}, containing the p-values for the null
-#'   hypothesis that the gene is equally specific to all cell clusters is true.}
+#'   hypothesis that the gene is not highly specific to a cell cluster is true.}
 #'   \item{adjpval}{A matrix with the same size as \code{pval}, containing the
 #'   corrected p-values using the method specified in \code{correctMethod}}
 #' @export
 #' @author Mahmoud M Ibrahim <[email protected]>
 #' @examples
-#' data(sim)
-#' sg = sortGenes(sim$exp, sim$cellType)
-#' pp = getPValues(sg)
-#'
+#' data(kidneyTabulaMuris)
+#' gs = sortGenes(kidneyTabulaMuris$exp, kidneyTabulaMuris$cellType)
+#' pp = getPValues(gs)
 #' #obtain genes that are "differentially expressed" in at least one cluster
 #' markers = names(which(apply(pp$adjpval, 1, function(x) any(x < 0.01))))
+#'
+#' @seealso
+#' getMarkers
 getPValues = function(gs, numPerm = 5, correctMethod = "BH", testGenes = NULL, subsetCells = NULL, cores = 1, seed = 111) {
 
 	ngroup = ncol(gs$specScore)

diff --git a/R/kidneyTabulaMuris.R b/R/kidneyTabulaMuris.R
@@ -1,9 +1,9 @@
-#' A subset of the Tabula Muris SmartSeq data, restricted to three cell types from Kidney.
+#' A subset of the Tabula Muris SmartSeq2 data, restricted to three cell types from Kidney.
 #'
 #' A subset of the Tabula Muris data with 215 cells and 23341 genes. There are three
 #' cell types, endothelial cells: 122 cells, kidney collecting duct epithelial cell: 78 cells and leukoocytes 15 cells.
 #'
 #' @format A list with two components:
 #' \describe{\item{exp}{Gene expression sparse matrix of class dgCMatrix, with normalized log expression values. Rows are genes, columns are cells.} \item{cellType}{A character vector containing the cell types.}}
-#' @references Schaum et al. Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris. Nature, 562:7727, 2018
+#' @references The Tabula Muris Consortium Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris. Nature, 562:7727, 2018
 "kidneyTabulaMuris"
diff --git a/R/plotCorrelationHeat.R b/R/plotCorrelationHeat.R
@@ -1,10 +1,9 @@
 #' plotCorrelationHeat
 #'
-#' \code{plotCorrelationHeat} uses the specificity scores to correlate cell
-#' clusters with each other and plot a heatmap.
+#' \code{plotCorrelationHeat} uses the specificity scores returned by \code{sortGenes} to 
+#' correlate cell clusters with each other and plot a heatmap.
 #'
-#' @param gs A list containing \code{$specScore} sparse matrix. Typically, the
-#'   output of \code{sortGenes}.
+#' @param gs The output of \code{sortGenes}.
 #' @param markers Restrict correlation analysis to those genes. A character
 #'   vector.
 #' @param corMethod Correlation method, will passed to \code{method} in the
@@ -14,20 +13,21 @@
 #' returned? FALSE by default.
 #' @param displayNumbers Should correlation values be displayed on the heatmap?
 #'   TRUE by default.
-#' @return If \code{outs} is TRUE, the pheatmap object will be returned.
+#' @return If \code{outs} is TRUE, the pheatmap object and the correlation matrix will 
+#' be returned.
 #' @export
 #' @author Mahmoud M Ibrahim <[email protected]>
 #' @examples
 #' data(kidneyTabulaMuris)
-#' sg = sortGenes(kidneyTabulaMuris$exp, kidneyTabulaMuris$cellType)
-#' plotCorrelationHeat(sg)
+#' gs = sortGenes(kidneyTabulaMuris$exp, kidneyTabulaMuris$cellType)
+#' plotCorrelationHeat(gs)
 #'
 #' #user only marker genes and spearman correlation
-#' mm = getMarkers(sg, quant = 0.95)
-#' plotCorrelationHeat(sg, markers = mm$markers, corMethod = "spearman")
+#' mm = getMarkers(gs, quant = 0.95)
+#' plotCorrelationHeat(gs, markers = mm$markers, corMethod = "spearman")
 #'
-#' #do not plot correlation values
-#' plotCorrelationHeat(sg, markers = mm$markers, corMethod = "spearman", displayNumbers = FALSE)
+#' #do not write correlation values, useful if there are many cell clusters
+#' plotCorrelationHeat(gs, markers = mm$markers, corMethod = "spearman", displayNumbers = FALSE)
 plotCorrelationHeat = function(gs, markers = NULL, corMethod = "pearson", colors = colorRampPalette(rev(c("orangered4","orangered","gray90","dodgerblue","dodgerblue4")))(n=100), outs = FALSE, displayNumbers = TRUE) {
 
   if (is.null(markers)) {