CPTR6_analysis_markdown.Rmd

---
title: "Annunziata DSP Analysis for Protein Panel"
output: html_document
date: "2024-02-13"
---

```{r setup, include=FALSE}
# Echo the source of every chunk in the rendered document by default.
knitr::opts_chunk$set(echo = TRUE)

# Downloading the DSP package can be slow: make sure the download timeout
# is at least 300 seconds, keeping any larger value that is already set.
options(timeout = max(getOption("timeout"), 300))
```

## Install DSP Workflow from GitHub

```{r}

# One-time setup: uncomment the next line if devtools is not installed yet.
#install.packages("devtools")

# Attach devtools so that install_github() is available for the
# (commented-out) DSPWorkflow installation below.
library(devtools)

# One-time setup: uncomment to install DSPWorkflow from the "dev" branch of
# the NIDAP-Community GitHub repository.
#install_github("NIDAP-Community/DSPWorkflow", ref = "dev")
```

## Load the test data

``` {r Load Data, echo=TRUE}
# Inputs for the workflow: the dcc files, the pkc files and the annotation
# workbook. All paths are relative to the directory the document is knit
# from.

# The commented-out code below downloads a public test data set; it is kept
# for reference and is not used by the rest of this document.
# Create a folder to hold the test data. 
# Below we have designated the folder 'test_data'

# Set paths for downloading dcc files
#downloads.path <- "test_data/Human_Kidney/downloaded/"
#tar.file.name <- "kidney_dccs.tar.gz"
#full.tar.path <- paste0(downloads.path,tar.file.name)

# Check if dcc files were previously downloaded
#if (!file.exists(full.tar.path)) {
  
  # Download dcc files and place in data folder
#  data.url <- "http://hpc.nih.gov/~CCBR/DSPWorkflow/kidney_dccs.tar.gz"
#  download.file(data.url, full.tar.path)
#  untar(full.tar.path, exdir = downloads.path)
#}

# Collect every dcc file below the "dccs" folder, including sub-folders.
# The dot is escaped so that only names ending in the literal extension
# ".dcc" match; an unescaped "." is a regex wildcard and would also accept
# names such as "sample_dcc".
dcc.files <- dir(
  path = "dccs",
  pattern = "\\.dcc$",
  full.names = TRUE,
  recursive = TRUE
)

# pkc files passed to studyDesign(), one per panel.
pkc.files <- c(
  "Mm_P_NGS_Core_v1.0.pkc",
  "Mm_P_NGS_ImmuneActivation_v1.0.pkc",
  "Mm_P_NGS_ImmuneCellTyping_v1.0.pkc",
  "Mm_P_NGS_Myeloid_v1.0.pkc"
)

# Excel workbook holding the sample annotation.
pheno.data.file <- "CPTR6_Annunziata_annotation.xlsx"
```

# Run the first step of the DSP Workflow package

## 1. Study Design:

```{r Study Design, echo=TRUE}

library(DSPWorkflow)

# Build the study design from the dcc files, pkc files and the
# "Annotation template" sheet of the annotation workbook. The remaining
# arguments name the annotation columns that studyDesign() should use.
# The result is saved as a list.
sdesign.list <- studyDesign(dcc.files = dcc.files,
                            pkc.files = pkc.files,
                            pheno.data.file = pheno.data.file,
                            pheno.data.sheet = "Annotation template",
                            pheno.data.dcc.col.name = "Sample_ID",
                            protocol.data.col.names = c("aoi", "roi"),
                            experiment.data.col.names = c("panel"),
                            slide.name.col = "slide name",
                            class.col = "class",
                            region.col = "region",
                            segment.col = "segment",
                            area.col = "area",
                            nuclei.col = "nuclei",
                            sankey.exclude.slide = FALSE,
                            segment.id.length = 4)

# The output of the study design function is a GeoMxSet object and a plot.
# Print out a summary of the object

print(sdesign.list$object)

# Print out the Sankey plot. This previously printed sdesign.list$object a
# second time, so the plot was never shown.

print(sdesign.list$sankey.plot)
```

## 2. QC Preprocessing:

```{r QC Preprocessing, echo=TRUE}

# Run segment QC on the study-design object with the thresholds listed
# below, asking qcProc() to print its plots as it goes.
qc.output <- qcProc(
  object = sdesign.list$object,
  min.segment.reads = 1000,
  percent.trimmed = 80,
  percent.stitched = 80,
  percent.aligned = 75,
  percent.saturation = 50,
  min.negative.count = 1,
  max.ntc.count = 9000,
  min.nuclei = 20,
  min.area = 1000,
  print.plots = TRUE
)

# Show the segments.qc element of the QC result.
print(qc.output$segments.qc)
```

## 3. Filtering:

```{r Filtering, echo=TRUE}
  
    # Genes of interest handed to filtering() through its `goi` argument.
    goi <- c(
      "PDCD1", "CD274", "IFNG", "CD8A", "CD68", "EPCAM",
      "KRT18", "NPHS1", "NPHS2", "CALB1", "CLDN8"
    )

    # Filter the QC'd object using the cutoffs below.
    filtering.output <- filtering(
      object = qc.output$object,
      loq.cutoff = 2,
      loq.min = 2,
      segment.gene.rate.cutoff = 0.1,
      study.gene.rate.cutoff = 0.1,
      sankey.exclude.slide = FALSE,
      goi = goi
    )

    # Print each element of the filtering result in turn; the goi table is
    # printed without row names.
    print(filtering.output$stacked.bar.plot)
    print(filtering.output$segment.table)
    print(filtering.output$sankey.plot)
    print(filtering.output$genes.detected.plot)
    print(filtering.output$goi.table, row.names = FALSE)
```


## 4. Normalization:
  
```{r Normalization, echo=TRUE}
  
    # Normalize the filtered object with norm = "q3" and print the three
    # plots returned alongside the normalized object.
    q3.normalization.output <- geomxNorm(object = filtering.output$object,
                                         norm = "q3")

    print(q3.normalization.output$multi.plot)
    print(q3.normalization.output$boxplot.raw)
    print(q3.normalization.output$boxplot.norm)

    # Repeat on the same filtered object with norm = "neg" and print the
    # same three plots.
    neg.normalization.output <- geomxNorm(object = filtering.output$object,
                                          norm = "neg")

    print(neg.normalization.output$multi.plot)
    print(neg.normalization.output$boxplot.raw)
    print(neg.normalization.output$boxplot.norm)
```


## 5. Unsupervised Analysis:

```{r Unsupervised Analysis, echo=TRUE}

    # Unsupervised analysis: run dimReduct() on the q3-normalized object,
    # passing "region" as the colour variable and "class" as the shape
    # variable.
    unsupervised.output <- dimReduct(
      object = q3.normalization.output$object,
      point.size = 3,
      point.alpha = 1,
      color.variable1 = "region",
      shape.variable = "class"
    )

    # Print the PCA, tSNE and UMAP entries of the returned plot list.
    print(unsupervised.output$plot$PCA)
    print(unsupervised.output$plot$tSNE)
    print(unsupervised.output$plot$UMAP)

```


## 6. Clustering high CV Genes and Heatmap:

 
```{r Clustering high CV Genes, echo=TRUE}
    
    # Build the heatmap from the object returned by dimReduct().
    # Rows and columns are both clustered with the "average" method on
    # "correlation" distance; the colour breaks run from -3 to 3 in steps
    # of 0.05 over a 120-colour blue-white-red ramp.
    heatmap.output <- heatMap(
      object = unsupervised.output$object,
      ngenes = 200,
      norm.method = "quant",
      scale.by.row.or.col = "row",
      show.rownames = FALSE,
      show.colnames = FALSE,
      cluster.rows = TRUE,
      cluster.cols = TRUE,
      clustering.method = "average",
      clustering.distance.rows = "correlation",
      clustering.distance.cols = "correlation",
      annotation.row = NA,
      annotation.col = c("class", "segment", "region"),
      breaks.by.values = seq(-3, 3, 0.05),
      heatmap.color = colorRampPalette(c("blue", "white", "red"))(120)
    )

    print(heatmap.output$plot)
    
```


## 7. Differential Expression Analysis:


```{r Differential Expression Analysis,  echo=TRUE}

    # Differential expression on a fixed gene list, run twice on the
    # q3-normalized object: once "Within Groups" and once "Between Groups".
    # After each run the CALB1 fold change and p-value are printed next to a
    # reference value.
    goi <- c("CD274", "CD8A", "CD68", "EPCAM",
             "KRT18", "NPHS1", "NPHS2", "CALB1", "CLDN8")

    # Restrict the object to the genes of interest.
    # NOTE(review): subsetting by name requires every entry of `goi` to be a
    # feature name in the object - confirm against the panels loaded from
    # the pkc files.
    object <- q3.normalization.output$object
    object <- object[goi,]

    # Placeholders for the bare column names used inside dplyr::filter()
    # below.
    Gene <- Subset <- NULL

    # Return, as numeric, the value(s) of `stat.cols` from the row(s) of
    # `result` whose Gene and Subset columns match the given names.
    # dplyr is namespace-qualified so the chunk does not depend on dplyr or
    # magrittr being attached.
    extract.stat <- function(result, gene.name, subset.name, stat.cols) {
      hit <- dplyr::filter(result, Gene == gene.name & Subset == subset.name)
      as.numeric(dplyr::select(hit, dplyr::all_of(stat.cols)))
    }

    # Print an observed statistic and its reference value on separate lines.
    report.stat <- function(label, observed, expected) {
      cat(paste0("\nvalue of ", label, " is: ",
                 paste(observed, collapse = ", "), "\n"))
      cat(paste0("expected value is ", expected, "\n"))
    }

    #First analysis:
    reslist.1 <- diffExpr(object = object,
                          analysis.type = "Within Groups",
                          region.col = "region",
                          regions = c("glomerulus", "tubule"),
                          group.col = "class",
                          groups = c("DKD", "normal"),
                          n.cores = 1)
    # grid is namespace-qualified because it is not attached in this file.
    grid::grid.draw(reslist.1$sample_table)
    grid::grid.newpage()
    grid::grid.draw(reslist.1$summary_table)

    # Result columns whose names contain "logFC" / "_pval".
    lfc_col1 <- grep("logFC", colnames(reslist.1$result), value = TRUE)
    pval_col1 <- grep("_pval", colnames(reslist.1$result), value = TRUE)

    lfc.1 <- extract.stat(reslist.1$result, "CALB1", "normal", lfc_col1)
    pval.1 <- extract.stat(reslist.1$result, "CALB1", "normal", pval_col1)

    # NOTE(review): confirm that the reference values below were derived
    # from this data set.
    report.stat("CALB1 Fold Change", lfc.1, "-2.014")
    report.stat("CALB1 pval", pval.1, "0.0274")

    #Second analysis:
    reslist.2 <- diffExpr(object = object,
                          analysis.type = "Between Groups",
                          region.col = "region",
                          regions = c("glomerulus", "tubule"),
                          group.col = "class",
                          groups = c("DKD", "normal"),
                          n.cores = 1)
    grid::grid.draw(reslist.2$sample_table)
    grid::grid.newpage()
    grid::grid.draw(reslist.2$summary_table)

    lfc_col2 <- grep("logFC", colnames(reslist.2$result), value = TRUE)
    pval_col2 <- grep("_pval", colnames(reslist.2$result), value = TRUE)

    lfc.2 <- extract.stat(reslist.2$result, "CALB1", "tubule", lfc_col2)
    pval.2 <- extract.stat(reslist.2$result, "CALB1", "tubule", pval_col2)

    # NOTE(review): confirm that the reference values below were derived
    # from this data set.
    report.stat("CALB1 Fold Change", lfc.2, "-1.408")
    report.stat("CALB1 pval", pval.2, "0.01268")
    

```
## 8. Volcano Plot

This part is run on NIDAP.

## 9. Violin Plot

```{r Violin Plot, echo=TRUE}
    
    # Violin plots for a fixed gene list, built from the q3-normalized
    # object with group = "region" and facet.by = "segment".
    genes <- c("CD274", "CD8A", "CD68", "EPCAM",
               "KRT18", "NPHS1", "NPHS2", "CALB1", "CLDN8")

    violin.plot.test <- violinPlot(object = q3.normalization.output$object,
                                   expr.type = "q_norm",
                                   genes = genes,
                                   group = "region",
                                   facet.by = "segment")

    # gridExtra is not attached anywhere in this file, so grid.arrange() is
    # called through its namespace.
    gridExtra::grid.arrange(violin.plot.test)

```