RSW_CPTR6_analysis_markdown.Rmd

---
title: "Annunziata DSP Analysis for Protein Panel"
output: html_document
date: "2024-02-13"
---

```{r setup, include=FALSE}
# Show the source code of every chunk in the rendered report
knitr::opts_chunk$set(echo = TRUE)

# Use the DSPWorkflow directory as the root directory for all chunks
knitr::opts_knit$set(
  root.dir = "/rstudio-files/ccr-dceg-data/users/ned/DSPWorkflow"
)

# Downloading the DSP package can be slow: keep the current timeout if it is
# already longer than 300 seconds, otherwise raise it to 300
options(timeout = max(300, getOption("timeout")))
```

## Load the Input Data

``` {r Load Data, echo=TRUE}
# Create a folder to hold the test data. 
# Below we have designated the folder 'test_data'

# Set paths for downloading dcc files
#downloads.path <- "test_data/Human_Kidney/downloaded/"
#tar.file.name <- "kidney_dccs.tar.gz"
#full.tar.path <- paste0(downloads.path,tar.file.name)

# Check if dcc files were previously downloaded
#if (!file.exists(full.tar.path)) {
  
  # Download dcc files and place in data folder
#  data.url <- "http://hpc.nih.gov/~CCBR/DSPWorkflow/kidney_dccs.tar.gz"
#  download.file(data.url, full.tar.path)
#  untar(full.tar.path, exdir = downloads.path)
#}

library(readxl)

# Root folder that holds the inputs for this project
project.folder.path <- "/rstudio-files/ccr-dceg-data/users/ned/CPTR/CPTR-6 Annunziata/"

# Collect every DCC file below <project>/dccs, including sub-folders.
# `pattern` is a regular expression, so the dot is escaped to match a literal
# ".dcc" extension instead of "any character followed by dcc".
dcc.files <- dir(
  paste0(project.folder.path, "dccs"),
  pattern = "\\.dcc$",
  full.names = TRUE,
  recursive = TRUE
)

# PKC files passed to readNanoStringGeoMxSet() together with the DCC files
pkc.files <- paste0(
  project.folder.path,
  c(
    "Mm_P_NGS_Core_v1.0.pkc",
    "Mm_P_NGS_ImmuneActivation_v1.0.pkc",
    "Mm_P_NGS_ImmuneCellTyping_v1.0.pkc",
    "Mm_P_NGS_Myeloid_v1.0.pkc"
  )
)

# Annotation workbook handed to readNanoStringGeoMxSet() as phenoDataFile
annotation.file.path <- paste0(project.folder.path, "CPTR6_Annunziata_annotation.xlsx")

# Annotation table used later for the marker QC plots.
# NOTE(review): this reads the workbook from the "CPTR_6_DSP_Annunziata"
# sub-folder, which is a different location than `annotation.file.path` above.
# Confirm both copies are identical or point both at the same file.
annotation.df <- read_excel("/rstudio-files/ccr-dceg-data/users/ned/CPTR/CPTR-6 Annunziata/CPTR_6_DSP_Annunziata/CPTR6_Annunziata_annotation.xlsx")
```

# Set up the GeoMx Set object

```{r Study Design, echo=TRUE}

library(GeomxTools)

# Names of the annotation columns this analysis relies on

slide.name.col <- "slide name"
group.col <- "age"
subgroup.col <- "treatment"
segment.col <- "segment"
area.col <- "area"
nuclei.col <- "nuclei"

# Load all of the input DCC files, annotation, and PKC files
object <-
  readNanoStringGeoMxSet(
    dccFiles = dcc.files,
    pkcFiles = pkc.files,
    phenoDataFile = annotation.file.path,
    phenoDataSheet = "Annotation template",
    phenoDataDccColName = "Sample_ID",
    protocolDataColNames = c("aoi", "roi"),
    experimentDataColNames = c("panel"),
    analyte = "protein"
  )

# Print out a summary of the object
print(object)

# Rename the slide-name column to "slide_name" in the phenotype data ...
pheno.is.slide <- colnames(object@phenoData@data) == slide.name.col
colnames(object@phenoData@data)[pheno.is.slide] <- "slide_name"

# ... and in the matching variable metadata so both stay in sync
meta.is.slide <- rownames(object@phenoData@varMetadata) == slide.name.col
rownames(object@phenoData@varMetadata)[meta.is.slide] <- "slide_name"

# Build the segment-specific IDs:
# <group>|<subgroup>|<segment>|<slide>|<roi>, where the first four parts are
# truncated to `segment.id.length` characters.
# The segment column is looked up through `segment.col` so that the column
# names declared above are the single place to change them.
segment.id.length <- 4
pData(object)$segmentID <- paste(
  substr(pData(object)[[group.col]], 1, segment.id.length),
  substr(pData(object)[[subgroup.col]], 1, segment.id.length),
  substr(pData(object)[[segment.col]], 1, segment.id.length),
  substr(pData(object)$slide_name, 1, segment.id.length),
  sData(object)$roi,
  sep = "|"
)


```

# Sankey Plot

```{r}

library(ggplot2)
library(ggforce)

# Define the lanes of the Sankey plot
lane1 <- "treatment"
lane2 <- "age"
lane3 <- "segment"
lane4 <- "slide_name"
fill_lane <- "treatment"


# Establish variables for the Sankey plot
x <- id <- y <- n <- NULL

# Count the segments for every combination of the four lanes.
# dplyr is not attached until a later chunk, so count() is called through its
# namespace; without this the chunk fails in a fresh session.
count.mat <- dplyr::count(pData(object),
                          !!as.name(lane1),
                          !!as.name(lane2),
                          !!as.name(lane3),
                          !!as.name(lane4))

# Remove any rows with NA values
na.per.column <- colSums(is.na(count.mat))
na.total.count <- sum(na.per.column)

if (na.total.count > 0) {
  count.mat <- count.mat[!rowSums(is.na(count.mat)), ]
  # seq_len() stays correct even when every row was removed
  rownames(count.mat) <- seq_len(nrow(count.mat))
}


# Gather the data and plot in order: lane 1, lane 2, ..., lane n
# gather_set_data creates x, id, y, and n fields within sankey.count.data
sankey.count.data <- gather_set_data(count.mat, 1:4)

# Fix the left-to-right order of the lanes; the levels are the lane names
sankey.count.data$x <-
  factor(
    sankey.count.data$x,
    levels = c(lane1, lane2, lane3, lane4)
  )

# Horizontal offset for the "100 segments" scale bar
adjust.scale.pos <- 0

# plot Sankey diagram
sankey.plot <-
  ggplot(sankey.count.data,
         aes(
           x,
           id = id,
           split = y,
           value = n
         )) +
  geom_parallel_sets(aes(fill = !!as.name(fill_lane)), alpha = 0.5, axis.width = 0.1) +
  geom_parallel_sets_axes(axis.width = 0.2) +
  geom_parallel_sets_labels(color = "gray",
                            size = 5,
                            angle = 0) +
  theme_classic(base_size = 14) +
  theme(
    legend.position = "bottom",
    axis.ticks.y = element_blank(),
    axis.line = element_blank(),
    axis.text.y = element_blank()
  ) +
  scale_y_continuous(expand = expansion(0)) +
  scale_x_discrete(expand = expansion(0)) +
  labs(x = "", y = "") +
  # Scale bar: a vertical segment spanning 100 units (y = 20 to y = 120)
  annotate(
    geom = "segment",
    x = (4.25 - adjust.scale.pos),
    xend = (4.25 - adjust.scale.pos),
    y = 20,
    yend = 120,
    lwd = 2
  ) +
  annotate(
    geom = "text",
    x = (4.19 - adjust.scale.pos),
    y = 70,
    angle = 90,
    size = 5,
    hjust = 0.5,
    label = "100 segments"
  )

print(sankey.plot)

```


## 2. QC Preprocessing:

```{r QC Preprocessing, echo=TRUE}

library(GeomxTools)
library(tibble)
library(dplyr)

results.folder <- "/rstudio-files/ccr-dceg-data/users/ned/CPTR/CPTR-6 Annunziata/results/"

# Set the QC flags using the Nanostring defaults.
# The read-count cutoff is spelled `minSegmentReads`; the previous spelling
# ("minSegmenReads") is not a recognised cutoff name, so the value given here
# was never the one applied.
qc.output <- setSegmentQCFlags(
  object,
  qcCutoffs = list(
    minSegmentReads = 1000,
    percentAligned = 80,
    percentTrimmed = 80,
    percentStitched = 80,
    percentSaturation = 50,
    minNegativeCount = 10,
    maxNTCCount = 60,
    minArea = 16000
  )
)

# Review QC table low sequenced AOIs
qc <- protocolData(qc.output)
qc.df <- qc@data
print(qc.df)

# Review control probes
hk.names <- hkNames(qc.output)
print(hk.names)

igg.names <- iggNames(qc.output)
igg.names

# qcProteinSignal() returns a function; calling it draws the signal plot
fig <- qcProteinSignal(object = qc.output, neg.names = igg.names)

proteinOrder <- qcProteinSignalNames(object = qc.output, neg.names = igg.names)

# Draw the plot once. `feature.plot` keeps the value returned by fig() so the
# name stays defined for any later code.
# NOTE(review): fig() appears to draw with base graphics, so this value is not
# a ggplot object and cannot be passed to ggsave().
feature.plot <- fig()

# Save the feature plot by replaying fig() on a PNG device
export.feature.plot <- FALSE
if (export.feature.plot) {
  grDevices::png(
    filename = paste0(results.folder, "feature_plot.png"),
    width = 12,
    height = 10,
    units = "in",
    res = 300
  )
  fig()
  grDevices::dev.off()
}

# Build the list of QC-flagged segments

## From here on `object` is the QC-flagged data set
object <- qc.output
annotation.data <- pData(object)

## Names of all annotation columns
annotation.column.names <- colnames(annotation.data)

## Annotation columns to report: segmentID always, preceded by area and
## nuclei when they exist (nuclei first, then area)
select.annotation.columns <- c(
  intersect(c("nuclei", "area"), annotation.column.names),
  "segmentID"
)

## Subset the annotation; drop = FALSE keeps a data frame even when only one
## column is selected. The row names move into a SampleID column.
select.annotation.data <- tibble::rownames_to_column(
  annotation.data[, select.annotation.columns, drop = FALSE],
  var = "SampleID"
)

# Collect the QC flags

## Per-segment QC data
segment.qc.data <- object@protocolData@data

## The nested flag data frame (column names beginning with "QCFlags")
flag.columns <- select(segment.qc.data, starts_with("QCFlags"))

## Keep the rows that have at least one flag set, then expose the row names
## as a SampleID column
flagged.rows <- tibble::rownames_to_column(
  as.data.frame(
    flag.columns[rowSums(flag.columns$QCFlags == TRUE) > 0, ]
  ),
  var = "SampleID"
)

# Gather the additional QC data

## Percentage columns that arrive nested and need flattening
percent.qc.column.names <- c("Trimmed (%)",
                             "Stitched (%)",
                             "Aligned (%)",
                             "Saturated (%)")

## Additional QC column names
add.qc.column.names <- c("Raw", percent.qc.column.names)

## Check for NTC column
if ("NTC" %in% colnames(segment.qc.data)) {
  add.qc.column.names <- c("NTC", add.qc.column.names)
}

## Additional QC data
add.qc.columns <- segment.qc.data[, add.qc.column.names]
add.qc.columns <- rownames_to_column(add.qc.columns, var = "SampleID")

## Turn one nested QC column into a plain vector.
## A data frame or matrix contributes its first column; a plain vector is
## returned as is.
## May cause an issue if there is more than one PKC file
flatten.qc.column <- function(column) {
  if (is.data.frame(column) || is.matrix(column)) {
    column <- column[, 1]
  }
  as.vector(unlist(column, use.names = FALSE))
}

## Flattened copies of the percentage columns under names without spaces.
## (The former NegGeoMean conversion was removed: that column is never
## selected above, so the code had no effect.)
add.qc.columns$TrimmedPerc <- flatten.qc.column(add.qc.columns[["Trimmed (%)"]])
add.qc.columns$StitchedPerc <- flatten.qc.column(add.qc.columns[["Stitched (%)"]])
add.qc.columns$AlignedPerc <- flatten.qc.column(add.qc.columns[["Aligned (%)"]])
add.qc.columns$SaturatedPerc <- flatten.qc.column(add.qc.columns[["Saturated (%)"]])

## Remove the nested data frames.
## A logical mask is used instead of -which(): if a name is ever missing,
## -which() would select zero columns and wipe the whole table.
keep.qc.column <- !(names(add.qc.columns) %in% percent.qc.column.names)
add.qc.columns <- add.qc.columns[, keep.qc.column, drop = FALSE]

# Assemble the final QC flag table with all supporting values

## Join annotation, additional QC values and flags on SampleID, in that order
merge.qc.flagged.df <- Reduce(
  function(left, right) merge(left, right, by = "SampleID"),
  list(select.annotation.data, add.qc.columns, flagged.rows)
)

## Column order: every measured value sits directly before its flag.
## NTC, area and nuclei pairs are appended only when the data provides them.
final.column.order <- c(
  "SampleID",
  "segmentID",
  "Raw", "LowReads",
  "TrimmedPerc", "LowTrimmed",
  "StitchedPerc", "LowStitched",
  "AlignedPerc", "LowAligned",
  "SaturatedPerc", "LowSaturation",
  if ("NTC" %in% colnames(segment.qc.data)) c("NTC", "HighNTC"),
  if ("area" %in% annotation.column.names) c("area", "LowArea"),
  if ("nuclei" %in% annotation.column.names) c("nuclei", "LowNuclei")
)

## The final QC flag df
final.flagged.segment.df <- merge.qc.flagged.df[, final.column.order]

## Give the raw read count a descriptive name
names(final.flagged.segment.df)[names(final.flagged.segment.df) == "Raw"] <- "RawReadCount"

print(final.flagged.segment.df)

qc.folder <- paste0(project.folder.path, "qc/")

## Optional export of the QC table (qc.df) as CSV
export.qc.flag.summary <- FALSE

if (isTRUE(export.qc.flag.summary)) {
  write.csv(qc.df, file = paste0(qc.folder, "CPTR6_QC_flag_summary.csv"))
}

```

## 3. Filtering:

```{r Filtering, echo=TRUE}
  
  # Segments can be dropped based on their QC flags.
  # The example below targets the segments flagged for low saturation.

  # Positions of the segments whose LowSaturation flag is TRUE
  lowSaturation <- local({
    qc.flags <- as.data.frame(protocolData(qc.output)[["QCFlags"]])
    which(qc.flags["LowSaturation"] == TRUE)
  })

  # To drop them and compare the dimensions before and after:
  #passedQC <- qc.output[, -lowSaturation]
  #dim(qc.output)
  #dim(passedQC)

```


## 4. Normalization:
  
```{r Normalization, echo=TRUE}

  # Check for best normalization type

  norm.factors <- computeNormalizationFactors(qc.output,
                                              igg.names = igg.names,
                                              hk.names = hk.names,
                                              area = "area",
                                              nuclei = "Nuclei count")

  # Concordance plots for Negative Controls.
  # Each variable is named after the factor it is plotted by (the names were
  # previously swapped); the printing order is unchanged: age, then treatment.
  igg.concordance.age <- plotConcordance(object = qc.output,
                                         targetList = igg.names,
                                         plotFactor = "age")

  igg.concordance.treatment <- plotConcordance(object = qc.output,
                                               targetList = igg.names,
                                               plotFactor = "treatment")

  print(igg.concordance.age)
  print(igg.concordance.treatment)

  # Concordance plots for Housekeeping controls
  hk.concordance.age <- plotConcordance(object = qc.output,
                                        targetList = hk.names,
                                        plotFactor = "age")

  hk.concordance.treatment <- plotConcordance(object = qc.output,
                                              targetList = hk.names,
                                              plotFactor = "treatment")

  print(hk.concordance.age)
  print(hk.concordance.treatment)

  # Generate normalized counts for each normalization type
  norm.hk <- normalize(qc.output, norm_method = "hk", toElt = "hk_norm")

  norm.neg <- normalize(qc.output, norm_method = "neg", toElt = "neg_norm")

  norm.q3 <- normalize(qc.output, norm_method = "quant", desiredQuantile = .75, toElt = "q_norm")

  # Generate probe figure for negative normalized counts
  probe.signal.hk <- qcProteinSignal(object = norm.neg, neg.names = hk.names)

  probe.signal.neg <- qcProteinSignal(object = norm.neg, neg.names = igg.names)

  proteinOrder <- qcProteinSignalNames(object = norm.neg, neg.names = igg.names)

  # Draw the figure built from norm.neg. The previous call, fig(), replayed
  # the plot created from qc.output in the QC chunk.
  probe.signal.neg()


  # Export all read count for raw and normalization methods

  # Prepend a "feature" column holding the row names of a count table
  add.feature.column <- function(count.table) {
    cbind("feature" = rownames(count.table), count.table)
  }

  # log2(x + 1) of the rounded-up counts for every column except "feature".
  # Selecting by "everything but feature" avoids relying on the sample
  # columns starting with "DSP".
  log2.norm.counts <- function(count.table) {
    as.data.frame(count.table, stringsAsFactors = FALSE) %>%
      mutate(across(-feature, ~ log2(ceiling(as.numeric(.x)) + 1)))
  }

  ## raw
  raw_counts <- as.data.frame(norm.hk@assayData$exprs)
  feature_list <- rownames(raw_counts)
  raw_counts <- add.feature.column(raw_counts)
  # log counts
  raw_log_counts <- raw_counts %>% mutate(across(-feature, ~ log2(.x + 1)))

  ## Housekeeping Normalization
  hk.norm_counts <- add.feature.column(norm.hk@assayData$hk_norm)
  # log counts
  hk_log_counts <- log2.norm.counts(hk.norm_counts)

  ## Negative Normalization
  neg.norm_counts <- add.feature.column(norm.neg@assayData$neg_norm)
  # log counts
  neg_log_counts <- log2.norm.counts(neg.norm_counts)

  ## Q3 Normalization
  q3.norm_counts <- add.feature.column(norm.q3@assayData$q_norm)
  # log counts
  q3_log_counts <- log2.norm.counts(q3.norm_counts)

  # Export all count tables

  export_counts <- FALSE

  if (export_counts) {

    write.csv(hk.norm_counts,
              paste0(results.folder, "hk_norm_counts.csv"),
              row.names = FALSE)

    write.csv(neg.norm_counts,
              paste0(results.folder, "neg_norm_counts.csv"),
              row.names = FALSE)

    write.csv(q3.norm_counts,
              paste0(results.folder, "q3_norm_counts.csv"),
              row.names = FALSE)

  }

  # Export the negative-normalized counts together with their log2 version.
  # Note: neg_norm_counts.csv is also written by the export_counts branch above.
  export.norm.counts <- FALSE

  if (export.norm.counts) {

    write.csv(neg.norm_counts,
              paste0(results.folder, "neg_norm_counts.csv"),
              row.names = FALSE)

    write.csv(neg_log_counts,
              paste0(results.folder, "neg_norm_log_counts.csv"),
              row.names = FALSE)

  }
  
  
```


# QC Plots for Marker Features Ki-67 and CD45

```{r}

# Set up the counts df for the marker features

library(dplyr)

# Marker features of interest
marker.features <- c("Ki-67", "CD45")

# Grab the log2 negative-normalized counts for the markers
marker.counts <- as.data.frame(neg_log_counts)
marker.counts <- marker.counts[marker.counts$feature %in% marker.features, ,
                               drop = FALSE]

# Name the rows by feature explicitly so the transposed columns are always
# the marker names, whatever row names the table carried before
rownames(marker.counts) <- marker.counts$feature

# Make the Sample IDs the rows. The "feature" column is left out so the
# transposed table stays numeric instead of being coerced to character.
sample.columns <- setdiff(colnames(marker.counts), "feature")
marker.counts.df <- as.data.frame(t(marker.counts[, sample.columns, drop = FALSE]))

# Create a column for the Sample IDs and remove the file extension .dcc
marker.counts.df$Sample_ID <- gsub("\\.dcc$", "", rownames(marker.counts.df))
rownames(marker.counts.df) <- NULL

# Drop the No Template Control rows from the annotation.
# %in% never returns NA, so a missing slide name no longer produces an
# all-NA row the way `!=` does.
is.ntc <- annotation.df[["slide name"]] %in% "No Template Control"
cleaned.annotation.df <- annotation.df[!is.ntc, ]

# Combine the counts with the annotation based on Sample ID
marker.boxplot.df <- merge(cleaned.annotation.df, marker.counts.df, by = "Sample_ID")

```

### Marker expression per Animal 

```{r}

# Create QC boxplots for all marker features

# Organize annotation
marker.boxplot.df$animal_num <- as.factor(marker.boxplot.df$'Animal #')

# Export the marker expression by animal number boxplots?
export.boxplot <- TRUE

# Folder for the exported plots; created up front so ggsave() cannot fail on
# a missing directory
marker.qc.plot.dir <- paste0(project.folder.path, "qc/marker_qc_plots/")
if (export.boxplot) {
  dir.create(marker.qc.plot.dir, recursive = TRUE, showWarnings = FALSE)
}

# For each marker make a boxplot
for (marker in marker.features) {

  # Convert counts from character to numeric
  marker.boxplot.df[[marker]] <- as.numeric(marker.boxplot.df[[marker]])

  # Set the upper and lower y limits of the plot (log2 counts); missing
  # values are ignored so a single NA cannot turn both limits into NA
  y.upper.limit <- max(marker.boxplot.df[[marker]], na.rm = TRUE) + 0.5
  y.lower.limit <- min(marker.boxplot.df[[marker]], na.rm = TRUE) - 0.5

  # Create a boxplot for expression per animal #.
  # Labels and limits are set in one y scale: adding ylim() after
  # scale_y_continuous() replaces the first scale and drops its labels.
  animal.number.marker.boxplot <- ggplot(marker.boxplot.df,
                                         aes(x = animal_num,
                                             y = .data[[marker]],
                                             color = age)) +
    geom_boxplot(notch = FALSE) +
    ggtitle(paste0(marker, " expression per animal")) +
    scale_y_continuous(labels = scales::comma,
                       limits = c(y.lower.limit, y.upper.limit)) +
    labs(x = "Animal #", y = paste0(marker, " log2 counts"))

  # Export the marker expression by animal number boxplot
  if (export.boxplot) {
    ggsave(paste0(marker.qc.plot.dir, marker, "_boxplot_animal.png"),
           animal.number.marker.boxplot,
           width = 12,
           height = 10)
  }

}


```
### Marker expression per ROI

```{r}
library(Polychrome)
# Reference for Polychrome:
# https://cran.r-project.org/web/packages/Polychrome/vignettes/creatingPalettes.html

# Add annotation for individual ROIs
marker.boxplot.df$animal_roi <- paste0(marker.boxplot.df$`Animal #`, "_", marker.boxplot.df$roi)

# Export switches for the per-age and the combined boxplots
export.age.boxplot <- TRUE
export.combined.boxplot <- TRUE

# Folder for the exported plots; created up front so ggsave() cannot fail on
# a missing directory
marker.qc.plot.dir <- paste0(project.folder.path, "qc/marker_qc_plots/")
if (export.age.boxplot || export.combined.boxplot) {
  dir.create(marker.qc.plot.dir, recursive = TRUE, showWarnings = FALSE)
}

# For each marker make a boxplot
for (marker in marker.features) {

  # Convert counts from character to numeric
  marker.boxplot.df[[marker]] <- as.numeric(marker.boxplot.df[[marker]])

  # Set the upper and lower y limits of the plot (log2 counts); the same
  # limits are shared by every plot of this marker. Missing values are ignored.
  y.upper.limit <- max(marker.boxplot.df[[marker]], na.rm = TRUE) + 0.5
  y.lower.limit <- min(marker.boxplot.df[[marker]], na.rm = TRUE) - 0.5

  # Separate by age
  for (age.type in unique(marker.boxplot.df$age)) {

    # Subset the marker data by age
    marker.boxplot.df.age <- marker.boxplot.df %>% filter(age == age.type)

    # Create colors for each animal to distinguish the ROIs
    animal.numbers <- unique(marker.boxplot.df.age$animal_num)
    animal.num.colors <- unname(createPalette(length(animal.numbers),
                                              c("#ff0000", "#00ff00", "#0000ff"),
                                              M = 1000,
                                              range = c(10, 70)))

    # Create the new boxplot by age.
    # Labels and limits are set in one y scale: adding ylim() after
    # scale_y_continuous() replaces the first scale and drops its labels.
    roi.marker.boxplot <- ggplot(marker.boxplot.df.age,
                                 aes(x = animal_roi,
                                     y = .data[[marker]],
                                     color = animal_num)) +
      geom_boxplot(notch = FALSE) +
      ggtitle(paste0(marker, " expression per roi in ", age.type)) +
      scale_y_continuous(labels = scales::comma,
                         limits = c(y.lower.limit, y.upper.limit)) +
      labs(x = "Animal#_ROI", y = paste0(marker, " log2 counts")) +
      theme(axis.text.x = element_text(size = 8, angle = 90)) +
      facet_wrap(. ~ treatment, scales = "free_x") +
      scale_color_manual(values = animal.num.colors)

    # Export the boxplot named by age
    if (export.age.boxplot) {
      ggsave(paste0(marker.qc.plot.dir, marker, "_boxplot_roi_", age.type, ".png"),
             roi.marker.boxplot,
             width = 12,
             height = 10)
    }

  }

  # Create a boxplot for expression per ROI across both ages, faceted by age
  roi.combined.marker.boxplot <- ggplot(marker.boxplot.df,
                                        aes(x = animal_roi,
                                            y = .data[[marker]],
                                            color = treatment)) +
    geom_boxplot(notch = FALSE) +
    ggtitle(paste0(marker, " expression per roi")) +
    scale_y_continuous(labels = scales::comma,
                       limits = c(y.lower.limit, y.upper.limit)) +
    labs(x = "Animal #", y = paste0(marker, " log2 counts")) +
    theme(axis.text.x = element_text(size = 5, angle = 90)) +
    facet_wrap(. ~ age, scales = "free_x")

  # Export the combined per-ROI boxplot
  if (export.combined.boxplot) {
    ggsave(paste0(marker.qc.plot.dir, marker, "_boxplot_roi_combined.png"),
           roi.combined.marker.boxplot,
           width = 12,
           height = 10)
  }

}

```

# QC plots for Nuclei Count

### Nuclei count per ROI

```{r}

# Convert the nuclei counts to a numeric column whose name has no spaces
marker.boxplot.df$`nuclei` <- as.numeric(marker.boxplot.df$`Nuclei count`)

# Set the upper and lower y limits of the plot (nuclei counts, padded by 10);
# missing values are ignored
y.upper.limit <- max(marker.boxplot.df$`nuclei`, na.rm = TRUE) + 10
y.lower.limit <- min(marker.boxplot.df$`nuclei`, na.rm = TRUE) - 10

# Export the per-age nuclei boxplots?
export.age.boxplot <- TRUE

# Folder for the exported plots; created up front so ggsave() cannot fail on
# a missing directory
nuclei.qc.plot.dir <- paste0(project.folder.path, "qc/nuclei_qc_plots/")
if (export.age.boxplot) {
  dir.create(nuclei.qc.plot.dir, recursive = TRUE, showWarnings = FALSE)
}

# Separate by age
for (age.type in unique(marker.boxplot.df$age)) {

  # Subset the marker data by age (this filter used to be applied twice)
  marker.boxplot.df.age <- marker.boxplot.df %>% filter(age == age.type)

  # Create colors for each animal to distinguish the ROIs
  # Reference for Polychrome:
  # https://cran.r-project.org/web/packages/Polychrome/vignettes/creatingPalettes.html
  animal.numbers <- unique(marker.boxplot.df.age$animal_num)
  animal.num.colors <- unname(createPalette(length(animal.numbers),
                                            c("#ff0000", "#00ff00", "#0000ff"),
                                            M = 1000,
                                            range = c(10, 70)))

  # Create the new boxplot by age.
  # Labels and limits are set in one y scale: adding ylim() after
  # scale_y_continuous() replaces the first scale and drops its labels.
  roi.nuclei.boxplot <- ggplot(marker.boxplot.df.age,
                               aes(x = animal_roi, y = nuclei, color = animal_num)) +
    geom_boxplot(notch = FALSE) +
    ggtitle(paste0("Nuclei count per roi in ", age.type)) +
    scale_y_continuous(labels = scales::comma,
                       limits = c(y.lower.limit, y.upper.limit)) +
    labs(x = "Animal#_ROI", y = "Nuclei count") +
    theme(axis.text.x = element_text(size = 8, angle = 90)) +
    facet_wrap(. ~ treatment, scales = "free_x") +
    scale_color_manual(values = animal.num.colors)

  # Export the boxplot named by age
  if (export.age.boxplot) {
    ggsave(paste0(nuclei.qc.plot.dir, "boxplot_nuclei_roi_", age.type, ".png"),
           roi.nuclei.boxplot,
           width = 12,
           height = 12)
  }

}
```


```{r}
# Create the Ki-67 boxplot per animal number

# `ki67.barplot.df` is not created anywhere earlier in this document, so the
# chunk failed with "object not found". It is rebuilt here the same way the
# CD45 table is built in the next section: the housekeeping-normalized counts
# of the feature, merged with the cleaned annotation by Sample_ID.
# NOTE(review): confirm that housekeeping-normalized counts are the intended
# input for the Ki-67 plots.
ki67.counts <- hk.norm_counts["Ki-67", , drop = FALSE]

# Make the Sample IDs the rows; the column is renamed to Ki_67 because that
# is the name the plots below refer to
ki67.counts.df <- as.data.frame(t(ki67.counts))
colnames(ki67.counts.df) <- "Ki_67"

# Create a column for the Sample IDs and remove the .dcc extension
ki67.counts.df$Sample_ID <- gsub("\\.dcc$", "", rownames(ki67.counts.df))
rownames(ki67.counts.df) <- NULL

# Combine the counts with the annotation based on Sample ID
ki67.barplot.df <- merge(cleaned.annotation.df, ki67.counts.df, by = "Sample_ID")
ki67.barplot.df$Ki_67 <- as.numeric(ki67.barplot.df$Ki_67)

ki67.barplot.df$`animal_num` <- as.factor(ki67.barplot.df$`Animal #`)

# Labels and limits are set in one y scale: adding ylim() after
# scale_y_continuous() replaces the first scale and drops its labels.
ki67.boxplot.animal <- ggplot(ki67.barplot.df, aes(x = animal_num, y = Ki_67)) +
  geom_boxplot(notch = FALSE) +
  ggtitle("Ki-67 expression per animal") +
  scale_y_continuous(labels = scales::comma, limits = c(0, 30000))


# Export the boxplot
export.boxplot <- FALSE

if (export.boxplot) {
  ggsave(paste0(project.folder.path, "results/ki67_boxplot_animal.png"), ki67.boxplot.animal, width = 12, height = 10)
}


```

# Additional box/bar plots for CD45

```{r}

# Prepare the CD45 table used by the CD45 plots

# Housekeeping-normalized counts of the CD45 feature, kept as a one-row matrix
cd45.counts <- hk.norm_counts["CD45", , drop = FALSE]

# Transpose so that every sample becomes a row
cd45.counts.df <- as.data.frame(t(cd45.counts))

# Sample IDs come from the row names with the trailing .dcc removed;
# the row names themselves are then discarded
cd45.counts.df$Sample_ID <- sub("\\.dcc$", "", rownames(cd45.counts.df))
rownames(cd45.counts.df) <- NULL

# Attach the annotation by Sample ID
cd45.barplot.df <- merge(cleaned.annotation.df, cd45.counts.df, by = "Sample_ID")

# The counts have to be numeric for plotting
cd45.barplot.df[["CD45"]] <- as.numeric(cd45.barplot.df[["CD45"]])

# Create the boxplots for the two age groups, then combine into a single image

#cd45.barplot.df.old <- cd45.barplot.df[cd45.barplot.df$age == "old", ]
  
#cd45.barplot.df.young <- cd45.barplot.df[cd45.barplot.df$age == "young", ]

#cd45.boxplot.old <- ggplot(cd45.barplot.df.old, aes(x = treatment, y = CD45)) + 
#  geom_boxplot(notch = FALSE) + 
#  ggtitle("Old") + 
#  scale_y_continuous(labels = scales::comma) + 
#  ylim(0, 30000)

#cd45.bp.old.plotly <- plotly::ggplotly(cd45.boxplot.old)

#cd45.boxplot.young <- ggplot(cd45.barplot.df.young, aes(x = treatment, y = CD45)) + 
#  geom_boxplot(notch = FALSE) + 
#  ggtitle("Young") + 
#  scale_y_continuous(labels = scales::comma) + 
#  ylim(0, 30000)

#cd45.combined.boxplot <- grid.arrange(cd45.boxplot.young, cd45.boxplot.old, ncol = 2)

#title <- textGrob("CD45 signal by Age and Treatment", gp = gpar(fontsize = 20, fontface = "bold"))

#final.cd45.boxplot <- grid.arrange(title, cd45.combined.boxplot, heights = c(0.1, 0.9))

#grid.draw(final.cd45.boxplot)

#export.boxplot <- FALSE

#if(export.boxplot == TRUE){
#  ggsave(paste0(project.folder.path, "results/cd45_boxplot_age.png"), final.cd45.boxplot, width = 12, height = 10)
#}


# Create the CD45 boxplot per animal number

cd45.barplot.df$`animal_num` <- as.factor(cd45.barplot.df$`Animal #`)

# Labels and limits are set in one y scale: adding ylim() after
# scale_y_continuous() replaces the first scale and drops its labels.
cd45.boxplot.animal <- ggplot(cd45.barplot.df, aes(x = animal_num, y = CD45)) +
  geom_boxplot(notch = FALSE) +
  ggtitle("CD45 expression per animal") +
  scale_y_continuous(labels = scales::comma, limits = c(0, 30000))


# Export the boxplot
export.boxplot <- FALSE

if (export.boxplot) {
  ggsave(paste0(project.folder.path, "results/cd45_boxplot_animal.png"), cd45.boxplot.animal, width = 12, height = 10)
}


# Nuclei count per ROI against Ki-67 expression

ki67.barplot.df$`nuclei` <- as.numeric(ki67.barplot.df$`Nuclei count`)

ki67.barplot.df$animal_roi <- paste0(ki67.barplot.df$`Animal #`, "_", ki67.barplot.df$roi)

# Create a plot labeled for plotly (the text aesthetic carries the ROI label)
ki67.nuclei.dot.plotly <- ggplot(ki67.barplot.df,
                                 aes(x = nuclei, y = Ki_67, text = paste0("Animal_ROI: ", animal_roi))) +
  geom_point() +
  labs(x = "Nuclei Count",
       y = "Ki-67 Expression",
       title = "Nuclei Count by Ki-67 Expression")

# Visualize the plotly plot. plotly is never attached in this document, so
# ggplotly() is called through its namespace.
plotly::ggplotly(ki67.nuclei.dot.plotly)

# Create a plot without a trend line
ki67.nuclei.dot.plot <- ggplot(ki67.barplot.df,
                               aes(x = nuclei, y = Ki_67)) +
  geom_point() +
  labs(x = "Nuclei Count",
       y = "Ki-67 Expression",
       title = "Nuclei Count by Ki-67 Expression")

# Export the basic dot plot
export.dotplot <- FALSE
if (export.dotplot) {
  ggsave(paste0(project.folder.path, "results/ki67_nuclei_dotplot.png"), ki67.nuclei.dot.plot, width = 9, height = 8)
}

# Create a plot with a trend line
ki67.nuclei.dot.plot.trend <- ggplot(ki67.barplot.df,
                                     aes(x = nuclei, y = Ki_67)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = TRUE) +
  labs(x = "Nuclei Count",
       y = "Ki-67 Expression",
       title = "Nuclei Count by Ki-67 Expression")

# Check if there is a relationship between nuclei and Ki-67 expression
cor.test(ki67.barplot.df$nuclei, ki67.barplot.df$Ki_67, method = "spearman")


# Create the plot of expression per ROI.
# Labels and limits are set in one y scale: adding ylim() after
# scale_y_continuous() replaces the first scale and drops its labels.
ki67.boxplot.animal.roi <- ggplot(ki67.barplot.df, aes(x = animal_roi, y = Ki_67)) +
  geom_boxplot(notch = FALSE) +
  ggtitle("Ki-67 expression per ROI") +
  scale_y_continuous(labels = scales::comma, limits = c(0, 30000))


# Export the per-ROI boxplot. The export used to write the per-animal plot
# to ki67_boxplot_animal.png, so the plot built above was never saved.
export.boxplot <- FALSE

if (export.boxplot) {
  ggsave(paste0(project.folder.path, "results/ki67_boxplot_animal_roi.png"), ki67.boxplot.animal.roi, width = 12, height = 10)
}


# Nuclei count per ROI against CD45 expression

cd45.barplot.df$`nuclei` <- as.numeric(cd45.barplot.df$`Nuclei count`)

cd45.barplot.df$animal_roi <- paste0(cd45.barplot.df$`Animal #`, "_", cd45.barplot.df$roi)

# Create a plot labeled for plotly (the text aesthetic carries the ROI label)
cd45.nuclei.dot.plotly <- ggplot(cd45.barplot.df,
                                 aes(x = nuclei, y = CD45, text = paste0("Animal_ROI: ", animal_roi))) +
  geom_point() +
  labs(x = "Nuclei Count",
       y = "CD45 Expression",
       title = "Nuclei Count by CD45 Expression")

# Visualize the plotly plot. The labeled plot built above is shown; the
# previous call referenced cd45.nuclei.dot.plot, which is only defined below.
# plotly is never attached in this document, so ggplotly() is called through
# its namespace.
plotly::ggplotly(cd45.nuclei.dot.plotly)

# Create a plot without a trend line
cd45.nuclei.dot.plot <- ggplot(cd45.barplot.df,
                               aes(x = nuclei, y = CD45)) +
  geom_point() +
  labs(x = "Nuclei Count",
       y = "CD45 Expression",
       title = "Nuclei Count by CD45 Expression")

# Export the basic dot plot
export.dotplot <- FALSE
if (export.dotplot) {
  ggsave(paste0(project.folder.path, "results/cd45_nuclei_dotplot.png"), cd45.nuclei.dot.plot, width = 9, height = 8)
}

# Create a plot with a trend line
cd45.nuclei.dot.plot.trend <- ggplot(cd45.barplot.df,
                                     aes(x = nuclei, y = CD45)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = TRUE) +
  labs(x = "Nuclei Count",
       y = "CD45 Expression",
       title = "Nuclei Count by CD45 Expression")

# Check if there is a relationship between nuclei and CD45 expression
cor.test(cd45.barplot.df$nuclei, cd45.barplot.df$CD45, method = "spearman")


# Create the boxplot per ROI.
# This section used to rebuild the per-animal boxplot a second time; it now
# plots per ROI, matching its heading and the Ki-67 section.
# NOTE(review): confirm a per-ROI CD45 plot is what was intended here.
cd45.barplot.df$`animal_num` <- as.factor(cd45.barplot.df$`Animal #`)

# Labels and limits are set in one y scale: adding ylim() after
# scale_y_continuous() replaces the first scale and drops its labels.
cd45.boxplot.animal.roi <- ggplot(cd45.barplot.df, aes(x = animal_roi, y = CD45)) +
  geom_boxplot(notch = FALSE) +
  ggtitle("CD45 expression per ROI") +
  scale_y_continuous(labels = scales::comma, limits = c(0, 30000))


# Export the per-ROI boxplot
export.boxplot <- FALSE

if (export.boxplot) {
  ggsave(paste0(project.folder.path, "results/cd45_boxplot_animal_roi.png"), cd45.boxplot.animal.roi, width = 12, height = 10)
}


# Create bar plot for nuclei count per ROI

ki67.barplot.df$`nuclei` <- as.numeric(ki67.barplot.df$`Nuclei count`)

# One bar per animal/ROI.
# An earlier variant filled the bars by animal_num, but it was overwritten by
# this definition before being used, so only this one is kept.
nuclei.roi.bar.plot <- ggplot(ki67.barplot.df,
                              aes(x = animal_roi, y = nuclei)) +
  geom_bar(stat = "identity") +
  labs(x = "ROI",
       y = "Nuclei Count",
       title = "Nuclei Count by ROI Expression") +
  theme(axis.text.x = element_text(size = 6))

# Visualize the plotly plot. plotly is never attached in this document, so
# ggplotly() is called through its namespace.
plotly::ggplotly(nuclei.roi.bar.plot)


```

``` {r}
# Bar plots

# Create the bar plot
#ki67.boxplot <- ggplot(ki67.barplot.df, aes(x = treatment, y = Ki_67)) +
#  geom_boxplot(stat = "identity", 
#               outlier.colour="black", 
#               outlier.shape=16, 
#               outlier.size=2, 
#               notch=FALSE) +
#  labs(x = "age", y = "Counts", title = "Counts of Ki-67") +
#  scale_y_continuous(labels = scales::comma)

#ki67.boxplot <- ggplot(ki67.barplot.df, aes(x = treatment, y = Ki_67)) + 
#  geom_bar()
#  geom_errorbar()
  
#ggplot(data) +
#    geom_bar( aes(x=name, y=value), stat="identity", fill="skyblue", alpha=0.7) +
#    geom_errorbar( aes(x=name, ymin=value-sd, ymax=value+sd), width=0.4, colour="orange", alpha=0.9, #size=1.3)

```


## 5. Unsupervised Analysis:

```{r Unsupervised Analysis, echo=TRUE}
    
    library(gridExtra)
  
    # Run dimReduct() with the point/text settings shared by every call in this
    # chunk, print the PCA, tSNE and UMAP plots, and save each one as a PNG.
    #
    #   object          object passed to dimReduct()
    #   assay.data      name of the assay to reduce (e.g. "neg_norm")
    #   color.variable  annotation column used to colour the points
    #   file.stem       output path relative to results.folder, without the
    #                   "_pca.png" / "_tsne.png" / "_umap.png" suffix
    #   height, width   plot size passed to ggsave()
    #   ...             extra arguments forwarded to dimReduct()
    #
    # Returns the dimReduct() output.
    run_dim_reduct <- function(object, assay.data, color.variable, file.stem,
                               height = 6, width = 10, ...) {
      reduced <- dimReduct(object = object,
                           assay.data = assay.data,
                           point.size = 2.5,
                           point.alpha = 0.7,
                           symbol.size = 1,
                           text.size = 8,
                           color.variable1 = color.variable,
                           ...)

      plot.types <- c("PCA", "tSNE", "UMAP")

      for (plot.type in plot.types) {
        print(reduced$plot[[plot.type]])
      }

      for (plot.type in plot.types) {
        out.file <- paste0(results.folder, file.stem, "_",
                           tolower(plot.type), ".png")
        # ggsave() fails when the target folder is missing, so create it first
        dir.create(dirname(out.file), recursive = TRUE, showWarnings = FALSE)
        ggsave(out.file,
               plot = reduced$plot[[plot.type]],
               height = height, width = width)
      }

      reduced
    }

    # In the calls below "class" is the column the original comments label as
    # treatment and "region" the one they label as age.

    ###
    ## House Keeping Normalization
    ###
    unsupervised.output <- run_dim_reduct(norm.hk, "hk_norm", "region",
                                          file.stem = "hk_norm",
                                          height = 12, width = 20,
                                          shape.variable = "class")
    hk_norm_pca <- unsupervised.output$plot$PCA + ggtitle("Housekeeping")

    ###
    ## Negative Normalization
    ###
    # Coloured by treatment
    unsupervised.output <- run_dim_reduct(norm.neg, "neg_norm", "class",
                                          file.stem = "PCA/neg_norm_treatment")

    # Coloured by age; this PCA is the one used in the combined figure
    unsupervised.output <- run_dim_reduct(norm.neg, "neg_norm", "region",
                                          file.stem = "PCA/neg_norm_age")
    neg_norm_pca <- unsupervised.output$plot$PCA + ggtitle("Negative")

    ###
    ## Q3 Normalization
    ###
    # Coloured by treatment
    unsupervised.output <- run_dim_reduct(norm.q3, "q_norm", "class",
                                          file.stem = "PCA/q3_norm_treatment")

    # Coloured by age; this PCA is the one used in the combined figure
    unsupervised.output <- run_dim_reduct(norm.q3, "q_norm", "region",
                                          file.stem = "PCA/q3_norm_age")
    q_norm_pca <- unsupervised.output$plot$PCA + ggtitle("Quartile 3")

    # Side-by-side comparison of the three normalizations; legends are dropped
    # so the three panels get the same plotting area
    hk_norm_pca <- hk_norm_pca + theme(legend.position = "none")
    neg_norm_pca <- neg_norm_pca + theme(legend.position = "none")
    q_norm_pca <- q_norm_pca + theme(legend.position = "none")
    combined_pca <- grid.arrange(hk_norm_pca, neg_norm_pca, q_norm_pca, ncol = 3)

    # Save the combined plot
    ggsave(paste0(results.folder, "combined_pca.png"),
           plot = combined_pca,
           height = 12, width = 40)

```

# Use PCA tools

```{r}

# See reference vignette: https://bioconductor.org/packages/release/bioc/vignettes/PCAtools/inst/doc/PCAtools.html#introduction

library(stringr)
library(dplyr)
library(PCAtools)


```


# Unsupervised subset for age groups

```{r}


# PCA coloured by "class" within a single age group. The age group is read from
# the "region" column of pData(); %in% is used instead of == so that a missing
# region value gives FALSE rather than an NA column index.

# Neg normalization

# young group

neg_young_geomxset_object <- norm.neg[, pData(norm.neg)$region %in% "young"]

unsupervised.output <- dimReduct(object = neg_young_geomxset_object, 
                        assay.data = "neg_norm", 
                        point.size = 2.5,
                        point.alpha = 0.7, 
                        symbol.size = 1, 
                        text.size = 8, 
                        color.variable1 = "class"
    )

ggsave(file.path(paste0(results.folder, "PCA/neg_norm_treatment_pca_young.png")), 
      plot = unsupervised.output$plot$PCA, height = 6, width = 10)

# old group

neg_old_geomxset_object <- norm.neg[, pData(norm.neg)$region %in% "old"]

unsupervised.output <- dimReduct(object = neg_old_geomxset_object, 
                        assay.data = "neg_norm", 
                        point.size = 2.5,
                        point.alpha = 0.7, 
                        symbol.size = 1, 
                        text.size = 8, 
                        color.variable1 = "class"
    )

ggsave(file.path(paste0(results.folder, "PCA/neg_norm_treatment_pca_old.png")), 
      plot = unsupervised.output$plot$PCA, height = 6, width = 10)

# Q3 normalization

# young group
# NOTE(review): only the young group is plotted for the Q3 normalization; no
# Q3 PCA is produced for the old group.

q3_young_geomxset_object <- norm.q3[, pData(norm.q3)$region %in% "young"]

unsupervised.output <- dimReduct(object = q3_young_geomxset_object, 
                        assay.data = "q_norm", 
                        point.size = 2.5,
                        point.alpha = 0.7, 
                        symbol.size = 1, 
                        text.size = 8, 
                        color.variable1 = "class"
    )

ggsave(file.path(paste0(results.folder, "PCA/q3_norm_treatment_pca_young.png")), 
      plot = unsupervised.output$plot$PCA, height = 6, width = 10)

```

## 6. Clustering high CV Genes and Heatmap:

 
```{r Clustering high CV Genes, echo=TRUE}

    library(pheatmap)
    
    # Negative-normalized counts as a data frame. The first column is set
    # aside while every remaining column is coerced to numeric, then the two
    # parts are bound back together.
    neg.norm.df <- as.data.frame(neg.norm_counts)
    dcc.columns <- apply(neg.norm.df[, -1], 2, as.numeric)
    neg.norm.df <- cbind(neg.norm.df[, 1, drop = FALSE], dcc.columns)

    # log2(x + 1) of everything except the first column
    log.norm.df <- log2(neg.norm.df[, -1] + 1)
    #log.norm.df <- cbind(hk.norm.df[, 1, drop = FALSE], log.norm.counts)

    # Annotation tracks shown above the heatmap
    anno.columns <- c("age", "treatment")

    # Index the cleaned annotation (NTCs removed) by Sample_ID
    cleaned.annotation.df <- as.data.frame(cleaned.annotation.df)
    rownames(cleaned.annotation.df) <- cleaned.annotation.df$Sample_ID

    # Keep the selected annotation columns and sort the samples: first by
    # treatment, then by age, so that age ends up as the outermost grouping
    heatmap.annotation <- cleaned.annotation.df[, anno.columns]
    heatmap.annotation <- arrange(heatmap.annotation, treatment)
    heatmap.annotation <- arrange(heatmap.annotation, age)

    # Colours for each annotation level
    anno.colors <- list(
      age = c(young = "seagreen", old = "salmon2"),
      treatment = c(BLZ945 = "yellow", "IP-549" = "orange", control = "slategray")
    )

    # Strip the trailing .dcc so the count columns match the Sample_IDs
    colnames(log.norm.df) <- sub("\\.dcc$", "", colnames(log.norm.df))

    # Put the count columns in the same order as the annotation rows
    row.order <- rownames(heatmap.annotation)
    log.norm.df <- log.norm.df[, row.order]

    # Draw the row-scaled heatmap of all proteins; only the column clustering
    # differs between the two figures produced below
    draw_protein_heatmap <- function(cluster.columns) {
      pheatmap(log.norm.df,
               main = "All Protein Signatures",
               show_rownames = TRUE,
               scale = "row",
               show_colnames = FALSE,
               border_color = NA,
               cluster_rows = TRUE,
               cluster_cols = cluster.columns,
               clustering_method = "average",
               clustering_distance_rows = "correlation",
               clustering_distance_cols = "correlation",
               color = colorRampPalette(c("blue", "white", "red"))(120),
               annotation_row = NA,
               annotation_col = heatmap.annotation,
               annotation_colors = anno.colors)
    }

    # Write a heatmap to the project folder as a 12 x 10 inch image
    save_protein_heatmap <- function(heatmap, relative.path) {
      ggsave(paste0(project.folder.path, relative.path),
             plot = heatmap,
             width = 12,
             height = 10,
             units = "in")
    }

    # Heatmap with the samples kept in annotation order
    heatmap.all.proteins <- draw_protein_heatmap(cluster.columns = FALSE)

    export.heatmap <- TRUE
    if (export.heatmap == TRUE) {
      save_protein_heatmap(heatmap.all.proteins,
                           "results/heatmap_all_proteins.png")
    }

    # Heatmap with the samples clustered as well
    heatmap.all.proteins.clust <- draw_protein_heatmap(cluster.columns = TRUE)

    export.heatmap <- TRUE
    if (export.heatmap == TRUE) {
      save_protein_heatmap(heatmap.all.proteins.clust,
                           "results/heatmap_all_proteins_clust.png")
    }
    
```

## Differential Expression Analysis: Old group


```{r}

# Select the normalized count table used by the differential expression
# chunks below. norm.type must be one of "hk", "neg" or "q3".

norm.type <- "neg"

# switch() only evaluates the selected branch, so the other count tables do
# not need to exist. An unrecognised norm.type stops here instead of silently
# leaving norm.counts undefined (or holding a value from an earlier run).
norm.counts <- switch(norm.type,
  hk = hk.norm_counts,
  neg = neg.norm_counts,
  q3 = q3.norm_counts,
  stop("Unknown norm.type '", norm.type,
       "': expected \"hk\", \"neg\" or \"q3\"", call. = FALSE)
)


# Create a list to hold all of the DEG df results
deg.df.list <- list()


```

```{r Differential Expression Analysis,  echo=TRUE}


library(stats)
library(dplyr)
library(tidyr)

# Transpose the normalized counts so that samples are rows and targets columns
counts.transposed <- as.data.frame(t(norm.counts))

# Create a column for the Sample IDs and remove the .dcc
counts.transposed$Sample_ID <- gsub("\\.dcc$", "", rownames(counts.transposed))
rownames(counts.transposed) <- NULL


# Combine the counts with the annotation based on Sample ID
diff.eq.df <- merge(cleaned.annotation.df, counts.transposed, by = "Sample_ID")

# Columns to test, taken by name from the count table so the selection does
# not depend on how many annotation columns precede them after the merge
target.columns <- intersect(setdiff(colnames(counts.transposed), "Sample_ID"),
                            colnames(diff.eq.df))


# Fit a per-target linear model of counts on treatment (treated vs control).
#
#   data     merged annotation/count data frame holding only the two groups
#   targets  names of the count columns to test
#   treated  label of the treated group in data$treatment
#   control  label of the control group (used as the reference level)
#
# Returns a data frame with columns p_value, log_fold_change, gene and
# adjusted_p_value (FDR). Targets whose model yields no usable estimate are
# dropped before the p-value adjustment.
#
# NOTE(review): log_fold_change is the fitted treatment coefficient, i.e. the
# difference in group means (treated - control) on the scale of the counts in
# `data`; it is a log fold change only if those counts are log-transformed.
fit_treatment_lm <- function(data, targets, treated, control = "control") {
  # Fix the reference level explicitly. as.factor() orders levels
  # alphabetically (and locale-dependently), which can make the treated group
  # the reference and flip the sign of the estimate.
  group <- factor(as.character(data$treatment), levels = c(control, treated))

  coef.stats <- vapply(targets, function(target) {
    # as.numeric() is a no-op for numeric columns and protects against counts
    # that arrive as character after transposing a mixed-type table
    fit.coefs <- summary(lm(as.numeric(data[[target]]) ~ group))$coefficients
    if (nrow(fit.coefs) < 2) {
      # Treatment effect not estimable (e.g. only one group present)
      return(c(NA_real_, NA_real_))
    }
    c(fit.coefs[2, 4], fit.coefs[2, 1])
  }, numeric(2))

  results <- data.frame(
    p_value = unname(coef.stats[1, ]),
    log_fold_change = unname(coef.stats[2, ]),
    gene = targets,
    stringsAsFactors = FALSE
  )

  # Keep only targets with both a p-value and an estimate (one row per target)
  results <- results[complete.cases(results), , drop = FALSE]
  rownames(results) <- NULL

  # Adjust p-values for multiple testing
  results$adjusted_p_value <- p.adjust(results$p_value, method = "fdr")
  results
}


# BLZ945 treatment versus control

subset.diff.eq <- subset(diff.eq.df, age == "old" & treatment %in% c("BLZ945", "control"))

subset.diff.eq$treatment <- as.factor(subset.diff.eq$treatment)

old.blz945.results_df <- fit_treatment_lm(subset.diff.eq, target.columns,
                                          treated = "BLZ945")

# Print first few rows of results
print(head(old.blz945.results_df))

write.csv(old.blz945.results_df,
          paste0(results.folder, norm.type, "_DE_Old_BLZ945.csv"),
          row.names = FALSE)


# Explore DEGs
old.blz.degs <- old.blz945.results_df[old.blz945.results_df$adjusted_p_value < 0.05, ]

deg.df.list[[paste0(norm.type, ".old.blz")]] <- old.blz.degs


# Create a dataframe of counts for just the DEGs

# Gather the DEG names
old.blz.deg.list <- old.blz.degs$gene

# Gather the annotations to include
anno.columns.deg.counts <- c("Sample_ID", "treatment", "Animal #", "roi")
deg.count.columns <- c(old.blz.deg.list, anno.columns.deg.counts)

# Get only the counts for DEGs
old.blz.deg.counts <- subset.diff.eq[, deg.count.columns, drop = FALSE]

# Reorder the counts by treatment
old.blz.deg.counts <- old.blz.deg.counts[order(old.blz.deg.counts$treatment), ]

# Replace spaces in column names with underscores
names(old.blz.deg.counts) <- gsub(" ", "_", names(old.blz.deg.counts))

# Create the summary table
#summary_df <- old.blz.deg.counts %>%
#  group_by(treatment) %>%
#  summarize(RbIgG_mean = mean(as.numeric(Rb_IgG)),
#            CD31_mean = mean(as.numeric(CD31)), 
#            CD19_mean = mean(as.numeric(CD19))
#            )

# Print summary dataframe
#print(summary_df)


# IP549 treatment versus control

subset.diff.eq <- subset(diff.eq.df, age == "old" & treatment %in% c("IP-549", "control"))

subset.diff.eq$treatment <- as.factor(subset.diff.eq$treatment)

old.ip549.results_df <- fit_treatment_lm(subset.diff.eq, target.columns,
                                         treated = "IP-549")

# Print first few rows of results
print(head(old.ip549.results_df))

# File name follows the same "<norm.type>_DE_..." pattern as the BLZ945 table
write.csv(old.ip549.results_df,
          paste0(results.folder, norm.type, "_DE_Old_IP549.csv"),
          row.names = FALSE)

# Explore DEGs
old.ip549.degs <- old.ip549.results_df[old.ip549.results_df$adjusted_p_value < 0.05, ]

deg.df.list[[paste0(norm.type, ".old.ip549")]] <- old.ip549.degs

```

## Differential Expression Analysis: Young group


```{r}
# Columns to test, taken by name from the count table so the selection does
# not depend on how many annotation columns precede them in diff.eq.df
target.columns <- intersect(setdiff(colnames(counts.transposed), "Sample_ID"),
                            colnames(diff.eq.df))

# Fit a per-target linear model of counts on treatment (treated vs control).
# Defined in this chunk so that the chunk can be run on its own.
#
#   data     merged annotation/count data frame holding only the two groups
#   targets  names of the count columns to test
#   treated  label of the treated group in data$treatment
#   control  label of the control group (used as the reference level)
#
# Returns a data frame with columns p_value, log_fold_change, gene and
# adjusted_p_value (FDR). Targets whose model yields no usable estimate are
# dropped before the p-value adjustment.
#
# NOTE(review): log_fold_change is the fitted treatment coefficient, i.e. the
# difference in group means (treated - control) on the scale of the counts in
# `data`; it is a log fold change only if those counts are log-transformed.
fit_treatment_lm <- function(data, targets, treated, control = "control") {
  # Fix the reference level explicitly. as.factor() orders levels
  # alphabetically (and locale-dependently), which can make the treated group
  # the reference and flip the sign of the estimate.
  group <- factor(as.character(data$treatment), levels = c(control, treated))

  coef.stats <- vapply(targets, function(target) {
    # as.numeric() is a no-op for numeric columns and protects against counts
    # that arrive as character after transposing a mixed-type table
    fit.coefs <- summary(lm(as.numeric(data[[target]]) ~ group))$coefficients
    if (nrow(fit.coefs) < 2) {
      # Treatment effect not estimable (e.g. only one group present)
      return(c(NA_real_, NA_real_))
    }
    c(fit.coefs[2, 4], fit.coefs[2, 1])
  }, numeric(2))

  results <- data.frame(
    p_value = unname(coef.stats[1, ]),
    log_fold_change = unname(coef.stats[2, ]),
    gene = targets,
    stringsAsFactors = FALSE
  )

  # Keep only targets with both a p-value and an estimate. The check is made
  # per target (one row each); applying complete.cases() to the 2 x n matrix
  # of statistics would test its two rows instead.
  results <- results[complete.cases(results), , drop = FALSE]
  rownames(results) <- NULL

  # Adjust p-values for multiple testing
  results$adjusted_p_value <- p.adjust(results$p_value, method = "fdr")
  results
}

# Young BLZ945

subset.diff.eq <- subset(diff.eq.df, age == "young" & treatment %in% c("BLZ945", "control"))

subset.diff.eq$treatment <- as.factor(subset.diff.eq$treatment)

young.blz945.results_df <- fit_treatment_lm(subset.diff.eq, target.columns,
                                            treated = "BLZ945")

# Print first few rows of results
print(head(young.blz945.results_df))

write.csv(young.blz945.results_df,
          paste0(results.folder, "DE_Young_BLZ945.csv"),
          row.names = FALSE)

young.blz.degs <- young.blz945.results_df[young.blz945.results_df$adjusted_p_value < 0.05, ]

deg.df.list[[paste0(norm.type, ".young.blz")]] <- young.blz.degs

# Young IP549

subset.diff.eq <- subset(diff.eq.df, age == "young" & treatment %in% c("IP-549", "control"))

subset.diff.eq$treatment <- as.factor(subset.diff.eq$treatment)

young.ip549.results_df <- fit_treatment_lm(subset.diff.eq, target.columns,
                                           treated = "IP-549")

# Print first few rows of results
print(head(young.ip549.results_df))

write.csv(young.ip549.results_df,
          paste0(results.folder, "DE_Young_IP549.csv"),
          row.names = FALSE)

young.ip549.degs <- young.ip549.results_df[young.ip549.results_df$adjusted_p_value < 0.05, ]

deg.df.list[[paste0(norm.type, ".young.ip549")]] <- young.ip549.degs

```

# Limma


```{r}

library(dplyr)
library(stringr)

# Remove the hyphen from the treatment labels ("IP-549" becomes "IP549"); the
# contrasts below refer to that level as treatmentIP549

cleaned.annotation.df <- cleaned.annotation.df %>%
  mutate(treatment = str_replace_all(treatment, "-", ""))

# Select the log counts table for the normalization type of interest.
# norm.type must be one of "hk", "neg" or "q3".

norm.type <- "neg"

# switch() only evaluates the selected branch; an unrecognised norm.type stops
# here instead of silently reusing whatever norm.log.counts held before
norm.log.counts <- switch(norm.type,
  hk = hk_log_counts,
  neg = neg_log_counts,
  q3 = q3_log_counts,
  stop("Unknown norm.type '", norm.type,
       "': expected \"hk\", \"neg\" or \"q3\"", call. = FALSE)
)

# Drop the feature-name column and strip the .dcc suffix so that the count
# columns can be selected by Sample_ID

norm.log.counts <- norm.log.counts %>% select(-feature)

#rownames(norm.log.counts) <- rownames(norm.counts)

colnames(norm.log.counts) <- gsub("\\.dcc$", "", colnames(norm.log.counts))

# Split the annotation by age group (%in% keeps rows with a missing age out of
# both groups instead of producing all-NA rows)

annotation.old <- cleaned.annotation.df[cleaned.annotation.df$age %in% "old", ]
annotation.young <- cleaned.annotation.df[cleaned.annotation.df$age %in% "young", ]

# Set up the count tables for old versus young

old.sample.IDs <- annotation.old$Sample_ID
young.sample.IDs <- annotation.young$Sample_ID

old.counts <- norm.log.counts[, old.sample.IDs]
young.counts <- norm.log.counts[, young.sample.IDs]

# Create a DGE list object for each age group
# NOTE(review): the values passed as counts are the log counts selected above;
# the DGEList objects are used below as containers for the count matrix and the
# sample annotation - confirm this is intended.

DGE.list.old <- DGEList(counts = old.counts, 
                    samples = annotation.old)

DGE.list.young <- DGEList(counts = young.counts, 
                    samples = annotation.young)

# Set up the design matrix for each age group (no intercept: one column per
# treatment level, named treatment<level>)

design.old <- model.matrix(~ 0 + treatment, data = DGE.list.old$samples)
design.young <- model.matrix(~ 0 + treatment, data = DGE.list.young$samples)

# Run the linear model for each age group

fit.old <- lmFit(DGE.list.old$counts, design.old)
fit.young <- lmFit(DGE.list.young$counts, design.young)

# Set up the contrasts of interest: each treatment minus control

#old
blz.old.contrast <- makeContrasts(treatmentBLZ945 - treatmentcontrol, 
                                  levels = colnames(coef(fit.old)))

ip549.old.contrast <- makeContrasts(treatmentIP549 - treatmentcontrol, 
                                  levels = colnames(coef(fit.old)))

#young
blz.young.contrast <- makeContrasts(treatmentBLZ945 - treatmentcontrol, 
                                  levels = colnames(coef(fit.young)))

ip549.young.contrast <- makeContrasts(treatmentIP549 - treatmentcontrol, 
                                  levels = colnames(coef(fit.young)))


# Estimate contrast

#old
blz.old.contrast.estimate <- contrasts.fit(fit.old, blz.old.contrast)
ip549.old.contrast.estimate <- contrasts.fit(fit.old, ip549.old.contrast)

#young
blz.young.contrast.estimate <- contrasts.fit(fit.young, blz.young.contrast)
ip549.young.contrast.estimate <- contrasts.fit(fit.young, ip549.young.contrast)

# Run Empirical Bayes smoothing of standard errors

#old
fit.blz.old.eb <- eBayes(blz.old.contrast.estimate, robust = TRUE)
fit.ip549.old.eb <- eBayes(ip549.old.contrast.estimate, robust = TRUE)

#young
fit.blz.young.eb <- eBayes(blz.young.contrast.estimate, robust = TRUE)
fit.ip549.young.eb <- eBayes(ip549.young.contrast.estimate, robust = TRUE)

# Full result table for each contrast, sorted by p-value

blz.old.results <- topTable(fit.blz.old.eb, sort.by = "P", n = Inf)
ip549.old.results <- topTable(fit.ip549.old.eb, sort.by = "P", n = Inf)
blz.young.results <- topTable(fit.blz.young.eb, sort.by = "P", n = Inf)
ip549.young.results <- topTable(fit.ip549.young.eb, sort.by = "P", n = Inf)

# Create deg lists for all contrasts
all.deg.lists <- list(blz.old.degs = blz.old.results, 
                  ip.old.degs = ip549.old.results, 
                  blz.young.degs = blz.young.results, 
                  ip.young.degs = ip549.young.results)

# Keep the targets with adjusted p-value < 0.05 and add a significance label:
# "***" for <= 0.001, "**" for <= 0.01, "*" otherwise
all.deg.lists <- lapply(all.deg.lists, function(result.table) {
  result.table %>%
    filter(adj.P.Val < 0.05) %>%
    mutate(sig.level = case_when(
      adj.P.Val <= 0.001 ~ "***",
      adj.P.Val <= 0.01 ~ "**",
      TRUE ~ "*"))
})


# Write all four (unfiltered) DE result tables to csv files

de.folder <- paste0(results.folder, "DE/")

# write.csv() cannot create a missing folder, so make sure it exists
dir.create(de.folder, recursive = TRUE, showWarnings = FALSE)

de.results <- list(Old_Blz = blz.old.results,
                   Old_IP549 = ip549.old.results,
                   Young_Blz = blz.young.results,
                   Young_IP549 = ip549.young.results)

for (result.label in names(de.results)) {
  write.csv(de.results[[result.label]],
            paste0(de.folder, norm.type, "_DE_", result.label, ".csv"),
            row.names = TRUE)
}


```


# Box plots for proteins of interest

```{r}

library(ggplot2)
library(scales)
library(gridExtra)
library(plotly)
library(grid)

# Remove the NTC rows from the annotation
cleaned.annotation.df <- annotation.df[annotation.df$'slide name' != "No Template Control", ]

# Arrange treatment order (sets the left-to-right order of the boxes)
treatment.order <- c("control", "IP-549", "BLZ945")
cleaned.annotation.df$treatment <- factor(cleaned.annotation.df$treatment,
                                          levels = treatment.order)

# Gather all targets
feature.list <- rownames(neg_log_counts)

# Set to FALSE to skip writing the PNG files
export.boxplot <- TRUE


# Significance label of a feature in one filtered DE table: the table's
# sig.level value when the feature is a row of the table, "NS" otherwise.
get_sig_label <- function(deg.table, feature) {
  if (feature %in% rownames(deg.table)) {
    deg.table[feature, "sig.level"]
  } else {
    "NS"
  }
}

# Boxplot of one feature by treatment for a single age group.
#
#   plot.df         rows of the merged annotation/count table for the age group
#   feature         name of the count column to plot
#   age.title       plot title ("Old" or "Young")
#   label.ip        label drawn above the bracket spanning boxes 1 and 2
#   label.blz       label drawn above the bracket spanning boxes 1 and 3
#   y.min, y.max    y-axis limits, shared by both age groups
#
# The limits are set inside scale_y_continuous(): a separate ylim() would
# replace that scale and drop the comma-formatted labels.
make_treatment_boxplot <- function(plot.df, feature, age.title,
                                   label.ip, label.blz, y.min, y.max) {
  ggplot(plot.df, aes(x = treatment, y = .data[[feature]])) +
    geom_boxplot(notch = FALSE) +
    ggtitle(age.title) +
    labs(x = "Treatment", y = paste0(feature, " log2 counts")) +
    scale_y_continuous(labels = scales::comma, limits = c(y.min, y.max)) +
    annotate("text",
             x = 1.5,
             y = y.max - 0.55,
             label = label.ip,
             vjust = 1.5,
             hjust = 0) +
    annotate("segment",
             x = 1,
             xend = 2,
             y = y.max - 0.7,
             yend = y.max - 0.7) +
    annotate("text",
             x = 2,
             y = y.max - 0.15,
             label = label.blz,
             vjust = 1.5,
             hjust = 0) +
    annotate("segment",
             x = 1,
             xend = 3,
             y = y.max - 0.3,
             yend = y.max - 0.3)
}


# One figure per target: young and old boxplots side by side under a title

for (feature in feature.list) {

  # Grab the counts
  feature.counts <- neg_log_counts[feature, , drop = FALSE]

  # Make the Sample ID as rows
  feature.counts.df <- as.data.frame(t(feature.counts))

  # Create a column for the Sample IDs and remove the .dcc
  feature.counts.df$Sample_ID <- gsub("\\.dcc$", "", rownames(feature.counts.df))
  rownames(feature.counts.df) <- NULL

  # Combine the counts with the annotation based on Sample ID
  feature.boxplot.df <- merge(cleaned.annotation.df, feature.counts.df, by = "Sample_ID")

  # Convert counts from character to numeric
  feature.boxplot.df[[feature]] <- as.numeric(feature.boxplot.df[[feature]])

  # Split by age group
  feature.boxplot.df.old <- feature.boxplot.df[feature.boxplot.df$age == "old", ]
  feature.boxplot.df.young <- feature.boxplot.df[feature.boxplot.df$age == "young", ]

  # Common y range for both panels, padded by one unit to leave room for the
  # significance brackets; missing values are ignored
  y.scale.max <- max(feature.boxplot.df[[feature]], na.rm = TRUE) + 1
  y.scale.min <- min(feature.boxplot.df[[feature]], na.rm = TRUE) - 1

  # Gather the significance level for each contrast ("NS" unless the feature
  # is present in the corresponding filtered DE table)
  label.old.blz <- get_sig_label(all.deg.lists$blz.old.degs, feature)
  label.old.ip <- get_sig_label(all.deg.lists$ip.old.degs, feature)
  label.young.blz <- get_sig_label(all.deg.lists$blz.young.degs, feature)
  label.young.ip <- get_sig_label(all.deg.lists$ip.young.degs, feature)

  feature.boxplot.old <- make_treatment_boxplot(feature.boxplot.df.old,
                                                feature, "Old",
                                                label.old.ip, label.old.blz,
                                                y.scale.min, y.scale.max)

  feature.boxplot.young <- make_treatment_boxplot(feature.boxplot.df.young,
                                                  feature, "Young",
                                                  label.young.ip, label.young.blz,
                                                  y.scale.min, y.scale.max)

  # arrangeGrob() builds the two-panel layout without drawing it, so only the
  # titled figure below is rendered
  feature.combined.boxplot <- arrangeGrob(feature.boxplot.young, feature.boxplot.old, ncol = 2)

  title <- textGrob(paste0(feature,  " signal by Age and Treatment"), gp = gpar(fontsize = 20, fontface = "bold"))

  final.feature.boxplot <- grid.arrange(title, feature.combined.boxplot, heights = c(0.1, 0.9))

  if (isTRUE(export.boxplot)) {
    # "/" in a target name would be read as a folder separator
    feature.file.name <- gsub("/", "_", feature, fixed = TRUE)

    ggsave(paste0(project.folder.path, "results/boxplots/neg_normalization_log/", feature.file.name, "_boxplot_age_neg_log.png"), final.feature.boxplot, width = 12, height = 10)
  }

}

```


# Create a summary table of DEGs

```{r}

library(knitr)
library(kableExtra)
library(dplyr)

# Normalization whose DEG tables are summarised in this chunk
norm.type <- "neg"

# Table theme shared by every table grob built below
theme <- ttheme_default()

# Turn a data frame into a table grob.
# A data frame without rows is rendered as a single row of empty cells with
# the same number of columns, so that an empty result still yields a grob.
df_to_grob <- function(df) {
  table.contents <- if (nrow(df) == 0) {
    matrix("", nrow = 1, ncol = ncol(df))
  } else {
    df
  }
  tableGrob(table.contents, theme = theme)
}

# Collect the DEG tables for the selected normalization, sorted from the most
# positive to the most negative log fold change

# Fetch one DEG table from deg.df.list by its name,
# "<norm.type>.<age group>.<drug>", and sort it by decreasing log fold change.
# Stops with a clear message when the table is missing instead of failing
# later on an undefined variable.
get_sorted_degs <- function(age.group, drug) {
  list.name <- paste0(norm.type, ".", age.group, ".", drug)
  degs <- deg.df.list[[list.name]]
  if (is.null(degs)) {
    stop("deg.df.list has no entry named '", list.name, "'", call. = FALSE)
  }
  degs %>% arrange(desc(log_fold_change))
}

old.blz <- get_sorted_degs("old", "blz")
old.ip549 <- get_sorted_degs("old", "ip549")
young.blz <- get_sorted_degs("young", "blz")
young.ip549 <- get_sorted_degs("young", "ip549")

# Convert data frames to grobs
old_blz_grob <- df_to_grob(old.blz)
old_ip549_grob <- df_to_grob(old.ip549)
young_blz_grob <- df_to_grob(young.blz)
young_ip549_grob <- df_to_grob(young.ip549)

#old_aligned <- gtable_combine(old_blz_grob,old_ip549_grob, along=2)
#young_aligned <- gtable_combine(young_blz_grob,young_ip549_grob, along=2)

#grid.newpage()

#grid.arrange(old_aligned, young_aligned, ncol = 2)


# Arrange grobs into a single table
#combined_table <- grid.arrange(old_blz_grob, 
#                               old_ip549_grob, 
#                               young_blz_grob, 
#                               young_ip549_grob, 
#                               ncol = 2, 
#                               nrow = 2, 
#                               padding = unit(2, "lines"))

# Display the table for the old BLZ945 contrast.
# The table grob is drawn directly with grid.draw(); grid.table() builds its
# own grob from a data frame or matrix and so cannot be given one.
grid.newpage()

grid.draw(old_blz_grob)


```

## 8. Volcano Plot

This part is run on NIDAP.

## 9. Violin Plot

```{r Violin Plot, echo=TRUE}
    
    # Targets to show in the violin plot
    # NOTE(review): confirm that these names exist in the object plotted below
    genes <- c(
      "CD274", "CD8A", "CD68", "EPCAM",
      "KRT18", "NPHS1", "NPHS2", "CALB1", "CLDN8"
    )

    # Violin plot of the "q_norm" expression values for the selected targets,
    # grouped by "region" and faceted by "segment"
    violin.plot.test <- violinPlot(
      object = q3.normalization.output$object,
      expr.type = "q_norm",
      genes = genes,
      group = "region",
      facet.by = "segment"
    )

    # Draw the plot
    grid.arrange(violin.plot.test)

```