diff --git a/.github/switch_sync_repo.R b/.github/switch_sync_repo.R
deleted file mode 100644
index 7975abf..0000000
--- a/.github/switch_sync_repo.R
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env Rscript
-
-# This script switches the repo entry for the yaml file to whatever is specified
-# Written by Candace Savonen Jan 2022
-
-if (!("optparse" %in% installed.packages())){
- install.packages("optparse")
-}
-
-library(optparse)
-
-option_list <- list(
- optparse::make_option(
- c("--repo"),
- type = "character",
- default = "jhudsl/OTTR_Template_Test",
- help = "GitHub repository name, e.g. jhudsl/OTTR_Template_Test",
- )
-)
-
-# Read the arguments passed
-opt_parser <- optparse::OptionParser(option_list = option_list)
-opt <- optparse::parse_args(opt_parser)
-
-# Find .git root directory
-root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
-
-# Get test sync yaml path
-sync_file_path <- file.path(root_dir, ".github", "test-sync.yml")
-
-yaml_contents <- yaml::yaml.load_file(sync_file_path)
-
-# Only keep first grouping
-yaml_contents$group <- yaml_contents$group[[1]]
-
-# Switch out repo
-yaml_contents$group$repos <- opt$repo
-
-yaml::write_yaml(yaml_contents, sync_file_path)
diff --git a/.github/sync.yml b/.github/sync.yml
deleted file mode 100644
index 27510d2..0000000
--- a/.github/sync.yml
+++ /dev/null
@@ -1,49 +0,0 @@
-# Candace Savonen Aug 2021
-# For info on how to update this file see: https://github.com/marketplace/actions/repo-file-sync-action#%EF%B8%8F-sync-configuration
-
-group:
- - files:
- - source: .github/workflows/
- dest: .github/workflows/
- deleteOrphaned: true
- exclude: |
- send-updates.yml
- test-send-updates.yml
- starting-course.yml
- release-notes.yml
- - source: .github/workflows/delete-preview.yml
- dest: .github/workflows/delete-preview.yml
- - source: .github/workflows/render-site.yml
- dest: .github/workflows/render-site.yml
- - source: .github/workflows/pull_request.yml
- dest: .github/workflows/pull_request.yml
- - source: config_automation.yml
- dest: config_automation.yml
- # Repositories to receive changes
- repos: |
- jhudsl/ottrproject.org
- jhudsl/Baltimore_Community_Course
- maculatus/test-ottr-website
- dr-sayyadhury/OTTR_Template_Website_repo
- whalera1901/Current-projects
- GenetcXBiotech1/Dr.Fierst_lab
- buriedsand/glbio-personal-website
- PurplFeesh/test-ottr-site
- jcha40/ottr_test_site
- jhudsl/ITN_computing_resources
- fhdsl/ITN_course_search
- cansavvy/cansavvy_website
- fhdsl/metricminer-dashboard
- fhdsl/Intermediate_R
-###ADD NEW REPO HERE following the format above#
-
-### These are custom groups for syncing -- not all files needs to be synced # will update later
- # - files:
- # - source: config_automation.yml
- # dest: config_automation.yml
- # - source: .github/workflows/pull-request.yml
- # dest: .github/workflows/pull-request.yml
- # - source: scripts/spell-check.R
- # dest: scripts/spell-check.R
- # repos: |
- # jhudsl/Baltimore_Community_Course
diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
index 0cf82f4..aa5c9e0 100644
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -95,14 +95,27 @@ jobs:
git merge -s recursive --strategy-option=theirs origin/${{ github.head_ref }} --allow-unrelated-histories
shell: bash
+ # Set up / install jq so that json credentials can be read in
+ - name: Install jq
+ uses: dcarbone/install-jq-action@v2.1.0
+
# We want a fresh run of the renders each time - so first delete old html files
- name: Delete old *.html
- run: Rscript -e "rmarkdown::clean_site(preview = FALSE)"
+ run: Rscript -e "rmarkdown::clean_site(input = 'pages', preview = FALSE)"
# Now we want to render all the html files from the Rmd files
- name: Run render html
id: site
- run: Rscript -e "rmarkdown::render_site()"
+ run: |
+ if [ ! -d .secrets ]; then
+ mkdir .secrets
+ fi
+ jq -n '${{ secrets.GS_SA_KEY }}' > .secrets/${{ secrets.GS_SA_FILENAME }}
+ if test -f .secrets/${{ secrets.GS_SA_FILENAME }}; then
+ echo "Key created!"
+ fi
+ Rscript --vanilla "resources/render.R"
+ rm -rf .secrets
# This checks on the steps before it and makes sure that they completed.
# If the renders didn't complete we don't want to commit the file changes
@@ -124,7 +137,7 @@ jobs:
echo "changes=$changes" >> $GITHUB_OUTPUT
git add . --force
git commit -m 'Render preview' || echo "No changes to commit"
- git pull --set-upstream origin $branch_name --allow-unrelated-histories --strategy-option=ours
+ git pull --rebase --set-upstream origin $branch_name --allow-unrelated-histories --strategy-option=ours
git push --force || echo "No changes to commit"
shell: bash
diff --git a/.github/workflows/render-site.yml b/.github/workflows/render-site.yml
index 7f74ea2..023582b 100644
--- a/.github/workflows/render-site.yml
+++ b/.github/workflows/render-site.yml
@@ -52,14 +52,27 @@ jobs:
# use github PAT
token: ${{ secrets.GH_PAT }}
+ # Set up / install jq so that json credentials can be read in
+ - name: Install jq
+ uses: dcarbone/install-jq-action@v2.1.0
+
# We want a fresh run of the renders each time - so first delete old html files
- name: Delete old *.html
- run: Rscript -e "rmarkdown::clean_site(preview = FALSE)"
+ run: Rscript -e "rmarkdown::clean_site(input = 'pages', preview = FALSE)"
# Now we want to render all the html files from the Rmd files
- name: Run render html
id: site
- run: Rscript -e "rmarkdown::render_site()"
+ run: |
+ if [ ! -d .secrets ]; then
+ mkdir .secrets
+ fi
+ jq -n '${{ secrets.GS_SA_KEY }}' > .secrets/${{ secrets.GS_SA_FILENAME }}
+ if test -f .secrets/${{ secrets.GS_SA_FILENAME }}; then
+ echo "Key created!"
+ fi
+ Rscript --vanilla "resources/render.R"
+ rm -rf .secrets
# This checks on the steps before it and makes sure that they completed.
# If the renders didn't complete we don't want to commit the file changes
@@ -77,4 +90,6 @@ jobs:
git config --global user.email 'github-actions[bot]@users.noreply.github.com'
git add --force docs/*
git commit -m 'Render site' || echo "No changes to commit"
+ git reset --hard HEAD
+ git pull --rebase --allow-unrelated-histories --strategy-option=ours
git push origin main || echo "No changes to push"
diff --git a/.github/workflows/send-updates.yml b/.github/workflows/send-updates.yml
deleted file mode 100644
index d41b15b..0000000
--- a/.github/workflows/send-updates.yml
+++ /dev/null
@@ -1,45 +0,0 @@
-# Candace Savonen Aug 2021
-
-name: Sync Files
-
-on:
- release:
- types:
- - published
- workflow_dispatch:
- inputs:
- prtag:
- description: 'Tag to use?'
- required: true
- default: 'null'
-
-jobs:
- sync:
- runs-on: ubuntu-latest
- steps:
- - name: Checkout Repository
- uses: actions/checkout@master
-
- - name: Login as jhudsl-robot
- run: |
- git config --global --add safe.directory $GITHUB_WORKSPACE
- git config --global user.email "itcrtrainingnetwork@gmail.com"
- git config --global user.name "jhudsl-robot"
-
- - name: Get the version
- id: get_tag
- run: |
- if [ github.event.inputs.prtag == 'null' ]
- then
- echo "version=$(echo $GITHUB_REF | cut -d / -f 3)" >> $GITHUB_OUTPUT
- fi
- if [ github.event.inputs.prtag != 'null' ]
- then
- echo "version=${{ github.event.inputs.prtag }}" >> $GITHUB_OUTPUT
- fi
-
- - name: Run Mechanics File Sync
- uses: BetaHuhn/repo-file-sync-action@v1.17.21
- with:
- GH_PAT: ${{ secrets.GH_PAT }}
- COMMIT_BODY: release-${{ steps.get_tag.outputs.version }}
diff --git a/.github/workflows/test-send-updates.yml b/.github/workflows/test-send-updates.yml
deleted file mode 100644
index 8c688eb..0000000
--- a/.github/workflows/test-send-updates.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-# Candace Savonen Aug 2021
-
-name: Test Sync Files
-
-on:
- workflow_dispatch:
- inputs:
- repo:
- description: 'What repo to test on e.g. jhudsl/OTTR_Template_Test'
- required: true
- default: 'jhudsl/OTTR_Template_Test'
-
-jobs:
- test-sync:
- runs-on: ubuntu-latest
- container:
- image: jhudsl/base_ottr:main
-
- steps:
- - name: Checkout Repository
- uses: actions/checkout@master
-
- - name: Login as jhudsl-robot
- run: |
- git config --system --add safe.directory "$GITHUB_WORKSPACE"
- git config --local user.email "itcrtrainingnetwork@gmail.com"
- git config --local user.name "jhudsl-robot"
-
- - name: Set up test-sync.yml
- run: |
- cp .github/sync.yml .github/test-sync.yml
- # Switch out repo argument
- Rscript --vanilla .github/switch_sync_repo.R --repo ${{ github.event.inputs.repo }}
-
- - name: Run Mechanics File Sync
- uses: BetaHuhn/repo-file-sync-action@v1.17.21
- with:
- GH_PAT: ${{ secrets.GH_PAT }}
- COMMIT_BODY: "test-run"
- CONFIG_PATH: .github/test-sync.yml
diff --git a/.gitignore b/.gitignore
index 5b6a065..bc47b05 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,6 @@
.Rhistory
.RData
.Ruserdata
+.secrets/*
+.DS_Store
+*/.DS_Store
\ No newline at end of file
diff --git a/IdentifyTypeOfUsers.Rmd b/IdentifyTypeOfUsers.Rmd
new file mode 100644
index 0000000..8fb66ba
--- /dev/null
+++ b/IdentifyTypeOfUsers.Rmd
@@ -0,0 +1,52 @@
+---
+title: "Identify current vs potential users"
+author: ""
+date: ""
+output: html_document
+---
+
+```{r, message = FALSE, echo = FALSE,results='hide', warning=FALSE}
+library(tidyverse)
+library(here)
+
+knitr::knit_child("TidyData.Rmd") #inherit resultsTidy
+```
+
+```{r, message=FALSE, echo=FALSE}
+resultsTidy %>%
+ group_by(UserType, CurrentUsageDescription) %>%
+ summarize(count = n()) %>%
+ mutate(CurrentUsageDescription = case_when(
+ CurrentUsageDescription == "For ongoing projects (e.g., consistent project development and/or work)" ~ "For ongoing projects:\nconsistent project development\nand/or work",
+ CurrentUsageDescription == "For completed/long-term projects (e.g., occasional updates/maintenance as needed)" ~ "For completed/long-term projects:\noccasional updates/maintenance\nas needed",
+ CurrentUsageDescription == "For short-term projects (e.g., short, intense bursts separated by a few months)" ~ "For short-term projects:\nshort, intense bursts\nseparated by a few months",
+ CurrentUsageDescription == "I do not currently use the AnVIL, but have in the past" ~ "I do not current use the AnVIL,\nbut have in the past",
+ CurrentUsageDescription == "I have never used the AnVIL, but have heard of it" ~ "I have never\nused the AnVIL",
+ CurrentUsageDescription == "I have never heard of the AnVIL" ~ "I have never\nheard of the AnVIL"
+ )) %>%
+ ggplot(aes(x = count, y = reorder(CurrentUsageDescription, count), fill = UserType)) +
+ scale_fill_manual(values = c("#E0DD10", "#035C94")) +
+ geom_bar(stat="identity", position ="stack") +
+ theme_classic() +
+ xlab("Count") +
+ ylab("") +
+ ggtitle("How would you describe your current usage\nof the AnVIL platform?") +
+ geom_text(aes(label = count, group = CurrentUsageDescription),
+ hjust = -0.5, size=2) +
+ theme(legend.title = element_blank())
+```
+
+## Takeaway
+
+Of the `r nrow(resultsTidy)` responses, `r nrow(resultsTidy %>% filter(UserType == "Current User"))` were current users and `r nrow(resultsTidy %>% filter(UserType == "Potential User"))` were potential users. The majority of current users belonged to the group who use the AnVIL for ongoing projects, while the majority of potential users were evenly split between those who have never used the AnVIL (but have heard of it) and those who previously used the AnVIL but don't currently.
+
+## Potential Follow-ups:
+
+- Look to see if those potential users who previously used to use the AnVIL show similarity in overall trends with the rest of the potential users
+- Directly ask why they no longer use the AnVIL
+
+Description of variable definitions and steps in preparing and plotting the data
+
+First, we group the data by the assigned `UserType` labels and their related, more detailed usage descriptions, then use `summarize` to count the occurrences of each combination. A `mutate` shortens the detailed descriptions so they fit on the plot. We then send this data to `ggplot` with the count on the x-axis and the usage descriptions on the y-axis (ordered by count, so the highest count is on top), filling by the assigned user type. We manually scale the fill to AnVIL colors, specify a stacked bar chart, adjust the theme and labels, and finally add a `geom_text` label with the count next to each bar before saving the plot.
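+
+The save step itself isn't shown in the chunk above; a minimal sketch (the output filename under `plots/` is hypothetical) could be:
+
+```{r, eval=FALSE}
+# Hypothetical save step: write the last rendered plot to the plots/ directory.
+# The filename is illustrative, not necessarily the one used for the README figures.
+ggsave(here("plots", "current_usage_by_user_type.png"),
+       width = 6, height = 4, dpi = 300)
+```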
+
+
diff --git a/README.md b/README.md
index 223e26c..6eedf80 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,19 @@
-# OTTR for Websites!
+# Analysis of the State of the AnVIL 2024 Poll
+This repository contains the analysis code and figures for the State of the AnVIL 2024 poll.
-Get started by going to [ottrproject.org](https://www.ottrproject.org/getting_started.html)!
+## Demographics
-This is a template for creating websites from Rmd files hosted on GitHub with three helpful automations following a pull request to the repository: spelling check, broken link check, and website rendering.
+### Institutional affiliation
-- Check for spelling errors more intensively than RStudio and allow you to add words to the dictionary
-- Check for broken links - you will be warned about broken links
-- Automatic rendering of the website for previewing before merges
-- Automatic rendering of the website upon merging to main
-- Docker images that can be customized.
+![Institutional affiliation](https://github.com/fhdsl/anvilPoll2024/blob/3c6a05514355bce61033481470f940645928b83e/plots/institutionalType_faceteduserType.png)
+### Highest Degree attained
+![Highest Degree attained](plots/degree_usertype.png)
+
+### Genomics and Clinical Research Experience
+![Research Experience](https://github.com/fhdsl/anvilPoll2024/blob/9178a9e2ca527eab98b8caeb3b31346e916113ab/plots/researchExperienceLevel_colorExperienceLevel_noUserTypeSplit.png)
+
+## Insights
+
+### Feature importance for current vs potential users
+![Feature importance dumbbell plot](plots/dumbbellplot_xlim16_rankfeatures.png)
diff --git a/TidyData.Rmd b/TidyData.Rmd
new file mode 100644
index 0000000..2d37a14
--- /dev/null
+++ b/TidyData.Rmd
@@ -0,0 +1,610 @@
+---
+title: "Tidy Data"
+author: "Kate Isaac, Elizabeth Humphries, & Ava Hoffman"
+date: "`r Sys.Date()`"
+output: html_document
+---
+
+```{r, message=FALSE}
+library(googlesheets4)
+library(tidyverse)
+library(magrittr) #for %<>%
+library(here)
+library(grid) #for Grobs and unit()
+library(scales) #pretty breaks
+```
+
+# Read in data
+
+Data were read in via a Google Sheet on the AnVIL Team Drive.
+
+Import details
+The google sheet we are reading in is stored in an AnVIL Google Drive folder, `State of the AnVIL 2024`. Its permissions are restricted such that only people with access can open it with the link. When using `gs4_auth()` to authorize my google account before running this code, I needed to change the `scopes` argument; specifically, `scopes = "spreadsheets.readonly"` was necessary.
+
+In this google sheet, each question is a column, and each response to the survey is a row. If the respondent wasn't asked or didn't answer a specific question, there is an NA in the corresponding row/column.
+
+```{r, eval=FALSE, echo=FALSE}
+gs4_auth(email = TRUE)
+```
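+
+The interactive authorization call with the modified `scopes` argument isn't evaluated here; a minimal sketch of that one-time call (assuming a cached token for the project's Google account) could be:
+
+```{r, eval=FALSE}
+# Authorize googlesheets4 with read-only access to Sheets.
+# email = TRUE reuses a cached token if one exists for this project.
+gs4_auth(email = TRUE, scopes = "spreadsheets.readonly")
+```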
+
+```{r, echo=FALSE, message=FALSE}
+resultsRaw <-
+ googlesheets4::read_sheet(
+ "https://docs.google.com/spreadsheets/d/1wDMNC6BD2AaIwh_GOkPTpl1tvAyLwVBQgAvOD2rYrX0/edit?usp=sharing",
+ na = c("NA", "na", ""))
+```
+
+
+
+# Clean data
+
+**Note:** Every code block in this section edits the `resultsTidy` data frame and should be run before plotting within the `# Insights` section below. Subsections are marked according to which Insight they relate to, but cleaning steps like identifying the user type are important for nearly every plot.
+
+## Set Column Names
+
+We set the column names to simplified column names (ones that make it easier to select related columns for various analyses) by reading in a codebook (`data/codebook.txt`).
+
+Simplifying column names details
+
+Description of variable definitions and steps
+
+We have a codebook that is a tab-delimited file with 4 columns, where each row represents a question in the survey. The first column lists the question from the survey (`SurveyColNames`); the second column lists a corresponding simplified column name for that survey question (`SimplifiedColNames`); the third column describes the variable format (`VariableFormat`), e.g., whether it is a double or a character; the fourth column gives a lengthier description of the question (`Description`), e.g., who was asked it, what the possible answers are, etc.
+
+This code block reads in that codebook and specifically selects the `SimplifiedColNames` column. It then renames the column names of the raw results from the google sheet (where each question is a column) with these simplified column names.
+
+
+
+```{r, message=FALSE}
+simplifiedColNames <-
+ read_delim(here("data/codebook.txt"),
+ delim = "\t",
+ col_select = SimplifiedColNames)
+resultsTidy <-
+ resultsRaw %>% `colnames<-`(unlist(simplifiedColNames))
+```
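+
+The renaming above assumes the codebook has exactly one row per column of the raw results, in the same order; a quick guard (a sketch, not run here) could be:
+
+```{r, eval=FALSE}
+# Stop early if the codebook and the sheet disagree on the number of questions.
+stopifnot(ncol(resultsRaw) == nrow(simplifiedColNames))
+```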
+
+
+## Keep last response if duplicated according to email (if email provided)
+
+We choose to keep the last response because the respondent may have spent more time thinking about how they wanted to respond after their initial response.
+
+Filtering duplicated responses details
+
+Description of variable definitions and steps
+
+* The `table` function tabulates the number of occurrences, and we tell it to ignore literal NAs. Because providing an email was optional, we expect many NA responses. The `table` function, by ignoring NAs, will return the unique emails and the number of times each email was used. We store the tabulated results in the variable `tabulatedEmails`
+* Using the `sum` function, we look to see how many emails/responses are provided more than once. `tabulatedEmails > 1` is returning a vector of TRUEs and FALSEs where TRUE means that there was more than one instance/count of a given email and FALSE means there wasn't. The `sum` function in essence counts the number of TRUEs and if the `sum` is greater than 0, that means there is at least one duplicated email whose count is greater than 1.
+* `duplicatedEmails` reports which emails are duplicated by using the tabulated/table of emails. First it identifies which emails were observed more than once, using the `which` function, and uses the indices returned from that to index the `names` of the tabulated emails, grabbing the specific emails.
+* We want to know which entries from the overall survey responses to remove for each duplicated email. Ideally, we want to remove the responses all at the same time or go backwards removing one at a time, because we don't want to affect downstream indices. The approach here keeps track of all the indices of interest and removes them at the same time.
+ * Therefore, we'll use `lapply` to loop through the duplicated emails (`duplicatedEmails`) and grab the index for survey responses associated with that email address (`which(resultsTidy$Email == duplicatedEmails[x])`).
+  * However, we want to keep the last survey response for each duplicated email. Therefore, we wrap that `which` call in `head(., -1)` so that it grabs all indices except the last one.
+ * Finally, we `unlist` the indices so that there's a single vector associated with indices for any duplicated email responses to be removed `IDXs_to_remove`. And since we want to remove them all at the same time, we subset `resultsTidy`, grabbing every row except those in `IDXs_to_remove`, as denoted by the `-`.
+
+
+
+```{r}
+
+tabulatedEmails <- table(resultsTidy$Email, useNA = "no")
+
+if (sum(tabulatedEmails > 1) > 0) {
+ duplicatedEmails <-
+ names(tabulatedEmails)[which(tabulatedEmails > 1)]
+ IDXs_to_remove <-
+ unlist(lapply(1:length(duplicatedEmails), function(x)
+ head(
+ which(resultsTidy$Email == duplicatedEmails[x]),-1
+ )))
+ resultsTidy <- resultsTidy[-IDXs_to_remove, ]
+}
+
+nrow(resultsTidy)
+```
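+
+An equivalent, more compact tidyverse version of the same filtering (a sketch, not run here) groups by `Email`, keeps every row with no email, and keeps only the last row for each provided email:
+
+```{r, eval=FALSE}
+# Keep all rows with a missing email, plus the most recent row per provided email.
+resultsTidy %>%
+  group_by(Email) %>%
+  filter(is.na(Email) | row_number() == n()) %>%
+  ungroup()
+```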
+
+
+
+## Identify type of user
+
+The first question of the poll asks respondents to describe their current usage of the AnVIL and allows us to categorize respondents as potential or current users of the AnVIL.
+
+Question and possible answers
+
+> How would you describe your current usage of the AnVIL platform?
+
+Possible answers include:
+
+* For completed/long-term projects (e.g., occasional updates/maintenance as needed)
+* For ongoing projects (e.g., consistent project development and/or work)
+* For short-term projects (e.g., short, intense bursts separated by a few months)
+* I do not currently use the AnVIL, but have in the past
+* I have never heard of the AnVIL
+* I have never used the AnVIL, but have heard of it.
+
+The first three possible answers represent current or returning AnVIL users. The last three possible answers represent potential AnVIL users.
+
+
+
+Identifying user type details
+
+Description of variable definitions and steps
+
+We use `case_when` to evaluate the response in the `CurrentUsageDescription` column and assign a corresponding, simplified label of "Current User" or "Potential User". In other words, we translate the given response into a user label. Using `case_when` as the nested function inside `mutate` means the translation is saved in a new column, `UserType`.
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(
+ UserType = case_when(
+ CurrentUsageDescription == "For ongoing projects (e.g., consistent project development and/or work)" ~ "Current User",
+ CurrentUsageDescription == "For completed/long-term projects (e.g., occasional updates/maintenance as needed)" ~ "Current User",
+ CurrentUsageDescription == "For short-term projects (e.g., short, intense bursts separated by a few months)" ~ "Current User",
+ CurrentUsageDescription == "I do not currently use the AnVIL, but have in the past" ~ "Potential User",
+ CurrentUsageDescription == "I have never used the AnVIL, but have heard of it" ~ "Potential User",
+ CurrentUsageDescription == "I have never heard of the AnVIL" ~ "Potential User"
+ )
+ ) %>%
+ mutate(UserType = factor(UserType, levels = c("Potential User", "Current User")))
+```
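+
+A more compact alternative (a sketch, not run here) collects the three "current user" answers in a vector; note that, unlike the `case_when` above, `%in%` would map a missing response to "Potential User" rather than `NA`:
+
+```{r, eval=FALSE}
+# The three answers that indicate a current or returning AnVIL user.
+currentAnswers <- c(
+  "For ongoing projects (e.g., consistent project development and/or work)",
+  "For completed/long-term projects (e.g., occasional updates/maintenance as needed)",
+  "For short-term projects (e.g., short, intense bursts separated by a few months)"
+)
+resultsTidy %>%
+  mutate(UserType = factor(
+    if_else(CurrentUsageDescription %in% currentAnswers, "Current User", "Potential User"),
+    levels = c("Potential User", "Current User")
+  ))
+```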
+
+
+
+## Institutional Affiliation: Synchronize Institution Names
+
+Users were able to disclose their institutional affiliation using a free text response; therefore, we needed to synchronize institution names (example: Johns Hopkins and Johns Hopkins University refer to the same institution, despite the difference in the free responses) and add simplified affiliation categories ([R1 University, R2 University, Community College, Medical Center or School, International Location, Research Center, NIH, Industry, Unknown] and [Research Intensive, Education Focused, and Industry & Other]). The first level of affiliation categories is notated in an institution-specific codebook (`data/institution_codebook.txt`).
+
+Question and possible answers
+
+> What institution are you affiliated with?
+
+Free response for answers
+
+
+
+Institutional affiliation synchronization details
+
+This synchronization corrects for the various spellings and capitalizations used for the same institution (e.g., Johns Hopkins and Johns Hopkins University refer to the same institution, despite the difference in the free responses).
+
+Description of variable definitions and steps
+
+We use a `recode()` within a `mutate()` to synchronize the institutional affiliations as necessary
+
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(
+ InstitutionalAffiliation =
+ recode(
+ InstitutionalAffiliation,
+ "Broad" = "Broad Institute",
+ "broad institute" = "Broad Institute",
+ "CUNY School of Public Health; Roswell Park Comprehensive Cancer Center" = "City University of New York",
+ "harvard" = "Harvard University",
+ "Harvard Public Health" = "Harvard University",
+ "Johns hopkins" = "Johns Hopkins",
+ "Johns Hopkins University" = "Johns Hopkins",
+ "OHSU" = "Oregon Health & Science University",
+ "OHSU (Knight Center)" = "Oregon Health & Science University",
+ "The Ohio State University" = "Ohio State University",
+ "UCSC" = "University of California Santa Cruz",
+ "univ. ca. santa cruz" = "University of California Santa Cruz",
+ "university of California santa cruz" = "University of California Santa Cruz",
+ "UMASS Chan Medical School" = "UMass Chan Medical School",
+ "Umass Chan Medical School" = "UMass Chan Medical School",
+ "Washington University in St Louis" = "Washington University in St. Louis",
+ "yikongene" = "Yikon Genomics",
+ "v" = "Unknown"
+ )
+ )
+```
+
+Elizabeth Humphries grouped institutional affiliations into a limited set of categories: R1 University, R2 University, Community College, Medical Center or School, International Location, Research Center, NIH, Industry, and Unknown; we notated those groupings/labels within the `institution_codebook.txt` data file. Grouping into limited institutional affiliation categories allows us to consolidate free answers for easier data visualization and identification of trends.
+
+Description of variable definitions and steps
+
+We use `read_delim()` to read in the `institution_codebook.txt` file and select just the `InstitutionalAffiliation` and `InstitutionalType` columns (ignoring the column that specifies how institutions were entered by survey respondents). We then use a `full_join()` by the `InstitutionalAffiliation` column so that the category labels are added as a new `InstitutionalType` column, joining the appropriate values based on the `InstitutionalAffiliation` column.
+
+
+
+```{r, message = FALSE}
+institutionCodeBook <- read_delim(here("data/institution_codebook.txt"), delim="\t", col_select = c(InstitutionalAffiliation, InstitutionalType))
+
+resultsTidy <- full_join(resultsTidy, institutionCodeBook, by = "InstitutionalAffiliation")
+```
+
+Here we further simplify the institutional types into three groups: Research Intensive, Education Focused, and Industry & Other.
+
+This groups R1 University, Research Center, Medical Center or School, and NIH as "Research Intensive"; R2 University & Community College as "Education Focused"; and Industry, International Location, or Unknown as "Industry & Other".
+
+```{r}
+resultsTidy %<>%
+ mutate(FurtherSimplifiedInstitutionalType =
+ case_when(
+ InstitutionalType == "R1 University" ~ "Research Intensive",
+ InstitutionalType == "Research Center" ~ "Research Intensive",
+ InstitutionalType == "Medical Center or School" ~ "Research Intensive",
+ InstitutionalType == "NIH" ~ "Research Intensive",
+ InstitutionalType == "R2 University" ~ "Education Focused",
+ InstitutionalType == "Community College" ~ "Education Focused",
+ InstitutionalType == "Industry" ~ "Industry & Other",
+ InstitutionalType == "International Location" ~ "Industry & Other",
+ InstitutionalType == "Unknown" ~ "Industry & Other"
+ )
+ )
+```
+
+
+
+## Highest degree attained
+
+This question allowed more than one response; however, only one respondent selected two (PhD, MD), which we recoded to MD/PhD. We also simplify the possible responses to group attained and in-progress degrees.
+
+
+Question and possible answers
+
+> What is the highest degree you have attained?
+
+Possible answers include (and multiple choices could be selected and would be comma separated if so)
+
+* High school or equivalent
+* Bachelor's degree
+* Master's degree in progress
+* Master's degree
+* PhD in progress
+* PhD
+* MD in progress
+* MD
+* Other (with free text entry)
+
+
+
+Degree recoding details
+
+Description of variable definitions and steps
+
+Because multiple responses could be selected (and would be comma separated if so) and a free text response was possible if Other was selected, we need to tidy the data from this question. From visual inspection of the data, the only time multiple responses were selected was for PhD and MD, and the Other option was never selected. So we'll just recode "PhD, MD" to "MD/PhD".
+
+Let's also set the factor levels to follow the general progression of degrees.
+
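+A quick way to double-check that visual inspection (a sketch, not run here) is to look for any response containing a comma, i.e., any multi-select:
+
+```{r, eval=FALSE}
+# Any Degrees response containing a comma selected more than one option.
+resultsTidy %>%
+  filter(str_detect(Degrees, ",")) %>%
+  count(Degrees)
+```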
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(
+ Degrees =
+ factor(recode(Degrees, "PhD, MD" = "MD/PhD"), levels = c("High School or equivalent", "Bachelor's degree", "Master's degree in progress", "Master's degree", "PhD in progress", "PhD", "MD in progress", "MD", "MD/PhD")),
+ FurtherSimplifiedDegrees = recode(Degrees,
+ "Master's degree in progress" = "Master's degree (or in progress)",
+ "Master's degree" = "Master's degree (or in progress)",
+ "PhD in progress" = "PhD (or in progress)",
+ "PhD" = "PhD (or in progress)",
+ "MD/PhD" = "MD (MD, MD/PhD, or in progress)",
+ "MD in progress" = "MD (MD, MD/PhD, or in progress)",
+ "MD" = "MD (MD, MD/PhD, or in progress)"
+ )
+ )
+```
+
+
+
+## Tool Knowledge and Comfort Separate from the AnVIL and on the AnVIL
+
+We want to recode these responses to set the factor levels/progression from "Don't know it" and "Not at all comfortable" all the way up to "Extremely comfortable", and to create corresponding integer comfort scores.
+
+Question and possible answers
+
+>How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)?
+>How would you rate your knowledge of or comfort with these technologies (on the AnVIL)?
+>How would you rate your knowledge of or comfort with these AnVIL data features?
+
+Shared technologies between the first two questions include
+
+* Jupyter Notebooks: `CurrentAnVILTechJupyterNotebooks` & `AllTechJupyterNotebooks`
+* Bioconductor & RStudio: `CurrentAnVILTechRStudio` & `AllTechRStudio` + `AllTechBioconductor`
+* Galaxy: `CurrentAnVILTechGalaxy` & `AllTechGalaxy`
+* WDL Workflows / Workflows (e.g., WDL): `CurrentAnVILTechWDL` & `AllTechWorkflows`
+* Containers: `CurrentAnVILTechContainers` & `AllTechContainers`
+* Unix / Command Line: `CurrentAnVILTechCommandLine` & `AllTechCommandLine`
+
+Technologies only asked separate from the AnVIL
+
+* Python: `AllTechPython`
+* R: `AllTechR`
+
+Technologies/data features only asked with regards to the AnVIL
+
+* Accessing controlled access datasets: `CurrentAnVILTechAccessData`
+* DUOS (Data Use Oversight System): `CurrentAnVILTechDUOS`
+* Terra on AnVIL (Workspaces): `CurrentAnVILTechTerra`
+* TDR (Terra Data Repository): `CurrentAnVILTechTDR`
+
+Possible answers for each of these questions include
+
+* Don't know it (0)
+* Not at all comfortable (1)
+* Slightly comfortable (2)
+* Somewhat comfortable (3)
+* Moderately comfortable (4)
+* Extremely comfortable (5)
+
+Possible "comfort scores" are notated in parentheses next to each answer above. We'll add these as additional columns whose names start with "Score_" but otherwise retain the original column name, in case it's helpful to still have the word responses (whose factor levels we'll set to reflect the progression of knowledge/comfort).
+
+Responses are NA if the question wasn't asked to the survey taker (e.g., they were a potential user and weren't asked about technologies with regards to the AnVIL)
+
+
+
+Cleaning Comfort level/scores for various technologies and resources details
+
+It's likely that someone who's a program administrator will select don't know for these.... should we remove them and see how average scores change?
+
+Description of variable definitions and steps
+
+We select the relevant columns (those that start with "CurrentAnVILTech" or "AllTech") that we want to work with. We don't want them to be lists. The non-tidyverse way of doing this would be `unlist(as.character(resultsTidy$AllTechPython))`. We can use the `unnest` tidyverse function with a `keep_empty = TRUE` argument so that it preserves the NULL values. Notice that in the non-tidyverse way, we had to use `as.character` in order to preserve the NULL values. In the tidyverse way, we still have to apply an `as.character` type change before the `unnest`; otherwise, we get an error that double and character values can't be combined.
+
+After the `unnest` we can use the `mutate` function to first work with these as factors (to set the progression we want from don't know it all the way to extremely comfortable) and then to make the replacements specified above for an integer score in place of the comfort level, placing these scores in new columns with names that begin with "Score_" and fill in the rest of the column name with the corresponding original column name.
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(across(starts_with(c(
+ "CurrentAnVILTech", "AllTech"
+ )), as.character)) %>%
+ unnest(starts_with(c("CurrentAnVILTech", "AllTech")), keep_empty = TRUE) %>%
+ mutate(across(starts_with(c(
+ "CurrentAnVILTech", "AllTech"
+ )), ~ parse_factor(
+ .,
+ levels = c(
+ "Don't know it",
+ "Not at all comfortable",
+ "Slightly comfortable",
+ "Somewhat comfortable",
+ "Moderately comfortable",
+ "Extremely comfortable"
+ )
+ ))) %>%
+ mutate(across(
+ starts_with(c("CurrentAnVILTech", "ALLTech")),
+ ~ case_when(
+ . == "Don't know it" ~ 0,
+ . == "Not at all comfortable" ~ 1,
+ . == "Slightly comfortable" ~ 2,
+ . == "Somewhat comfortable" ~ 3,
+ . == "Moderately comfortable" ~ 4,
+ . == "Extremely comfortable" ~ 5
+ )
+ ,
+ .names = "Score_{.col}"
+ ))
+```
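+
+Since the factor levels are already ordered from "Don't know it" up to "Extremely comfortable", an equivalent scoring step (a sketch, not run here) could use the factor level index directly:
+
+```{r, eval=FALSE}
+# The integer score is the factor level index minus one (0 through 5);
+# NA responses stay NA.
+resultsTidy %>%
+  mutate(across(
+    starts_with(c("CurrentAnVILTech", "AllTech")),
+    ~ as.integer(.x) - 1L,
+    .names = "Score_{.col}"
+  ))
+```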
+
+
+
+## Feature importance: Comparisons of rank of importance of features/resources between Current Users and Potential Users
+
+We want to recode these responses to remove labels and make them integers.
+
+Question and possible answers
+
+>Rank the following features or resources according to their importance for your continued use of the AnVIL
+
+>Rank the following features or resources according to their importance to you as a potential user of the AnVIL?
+
+* Easy billing setup
+* Flat-rate billing rather than use-based
+* Free version with limited compute or storage
+* On demand support and documentation
+* Specific tools or datasets are available/supported
+* Greater adoption of the AnVIL by the scientific community
+
+We're going to look at a comparison of the assigned ranks for these features, comparing between current users and potential users.
+
+
+
+Cleaning/recoding the feature importance ranks details
+
+Description of variable definitions and steps
+
+We can use `starts_with` to select these columns, specifically focusing on the starts with "PotentialRank" and "CurrentRank". When we made simplified names for the columns, these are the only twelve that start like that.
+
+Either the 6 CurrentRank or the 6 PotentialRank questions were asked of each survey taker, which means that we expect NULL values in these columns since not every survey taker will have answered all of these questions.
+
+We want to recode the following values
+
+* Replace "1 (Most important in this list)" with 1
+* Replace "6 (Least important in this list)" with 6
+
+Before we can do that, we first need to change the type of the columns in several ways. We don't want them to be lists. The non-tidyverse way of doing this would be `unlist(as.character(resultsTidy$PotentialRankEasyBillingSetup))`. We can use the `unnest` tidyverse function with a `keep_empty = TRUE` argument so that it preserves the NULL values. Notice in the non-tidyverse way, we had to use `as.character` in order to preserve the null values. In the tidyverse way, we still have to use an as.character type change before the `unnest`, otherwise, we get an error that double and character values can't be combined.
+
+After the `unnest` we can use the `recode` function to make the replacements specified above. We then change the type from character to integer so that we can compute average ranks & plot them more easily. Converting the literal "NULL" strings to integer would produce a warning that NAs are introduced by coercion, so we add a replacement in the `recode`, changing "NULL" to `NA_character_`.
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(across(starts_with(c(
+ "PotentialRank", "CurrentRank"
+ )), as.character)) %>%
+ unnest(starts_with(c("PotentialRank", "CurrentRank")), keep_empty = TRUE) %>%
+ mutate(across(
+ starts_with(c("PotentialRank", "CurrentRank")),
+ ~ recode(
+ .x,
+ "1 (Most important in this list)" = "1",
+ "6 (Least important in this list)" = "6",
+ "NULL" = NA_character_
+ )
+ )) %>%
+ mutate(across(starts_with(c(
+ "PotentialRank", "CurrentRank"
+ )), as.integer))
+```
+
+
+
+## Training Modality Preference
+
+We want to recode these responses to remove labels and make them integers.
+
+Question and possible answers
+
+>Please rank how/where you would prefer to attend AnVIL training workshops.
+
+Possible answers include
+
+* On-site at my institution: `AnVILTrainingWorkshopsOnSite`
+* Virtual: `AnVILTrainingWorkshopsVirtual`
+* Conference (e.g., CSHL, AMIA): `AnVILTrainingWorkshopsConference`
+* AnVIL-specific event: `AnVILTrainingWorkshopsSpecEvent`
+* Other: `AnVILTrainingWorkshopsOther`
+
+The responses are stored in the columns starting with `AnVILTrainingWorkshops`.
+
+
+
+Cleaning the training modality ranks details
+
+Description of variable definitions and steps
+
+We can use `starts_with` to select these columns, specifically focusing on the starts with "AnVILTrainingWorkshops". These are the only 5 that start like that when we made simplified column names.
+
+We want to recode the following values
+
+* Replace "1 (Most preferred in this list)" with 1
+* Replace "5 (Least preferred in this list)" with 5
+
+Before we can do that, we first need to change the type of the columns in several ways. We don't want them to be lists. We can use the `unnest` tidyverse function with a `keep_empty = TRUE` argument so that it preserves any NULL values, but first we have to use an `as.character` type change before the `unnest`, otherwise, we get an error that double and character values can't be combined.
+
+After the `unnest` we can use the `recode` function to make the replacements specified above. We then change the type from character to integer so that we can compute average ranks & plot them more easily. Converting the literal "NULL" strings to integer would produce a warning that NAs are introduced by coercion, so we add a replacement in the `recode`, changing "NULL" to `NA_character_`.
+
+
+
+```{r}
+
+resultsTidy %<>%
+ mutate(across(starts_with(
+ "AnVILTrainingWorkshops"), as.character)) %>%
+ unnest(starts_with("AnVILTrainingWorkshops"), keep_empty = TRUE) %>%
+ mutate(across(
+ starts_with("AnVILTrainingWorkshops"),
+ ~ recode(
+ .x,
+ "1 (Most preferred in this list)" = "1",
+ "5 (Least preferred in this list)" = "5",
+ "NULL" = NA_character_
+ )
+ )) %>%
+ mutate(across(starts_with("AnVILTrainingWorkshop"), as.integer))
+
+```
+
+
+
+## Simplified experience status for various research categories (clinical, human genomics, non-human genomics)
+
+We want to add three columns that act as flags reporting whether the respondent is
+
+* experienced with clinical research, specifically either moderately or extremely experienced in working with human clinical data
+* experienced with human genomics research, specifically moderately or extremely experienced in working with human genomics data
+* experienced with non-human genomics research, specifically moderately or extremely experienced in working with non-human genomics data
+
+We will use this information later to subset responses when considering popular tools or datasets.
+
+Question and possible answers
+
+>How much experience do you have analyzing the following data categories?
+
+The three research categories people are asked about include
+
+* Human Genomic
+* Non-human Genomic
+* Human Clinical
+
+Possible answers include
+
+* Not at all experienced
+* Slightly experienced
+* Somewhat experienced
+* Moderately experienced
+* Extremely experienced.
+
+
+
+Setting research category experience flag details
+
+Description of variable definitions and steps
+
+We use a `mutate` together with 3 `case_when`'s.
+
+* If the `HumanClinicalExperience` column response is "Moderately experienced" or "Extremely experienced", we mark that respondent as a human clinical research expert in the `clinicalFlag` column (`TRUE`). Otherwise, we mark a `FALSE` to signify they are not a clinical research expert.
+* If the `HumanGenomicExperience` column response is "Moderately experienced" or "Extremely experienced", we mark that respondent as a human genomic research expert in the `humanGenomicFlag` column (`TRUE`). Otherwise, we again mark a `FALSE` to signify not an expert.
+* If the `NonHumanGenomicExperience` column response is "Moderately experienced" or "Extremely experienced", we mark that respondent as a non-human genomic research expert in the `nonHumanGenomicFlag` column (`TRUE`). Otherwise, we again mark a `FALSE` to signify not an expert.
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(
+ clinicalFlag = case_when(
+ HumanClinicalExperience == "Moderately experienced" | HumanClinicalExperience == "Extremely experienced" ~ TRUE,
+ .default = FALSE
+ ),
+ humanGenomicFlag = case_when(
+ HumanGenomicExperience == "Moderately experienced" | HumanGenomicExperience == "Extremely experienced" ~ TRUE,
+ .default = FALSE
+ ),
+ nonHumanGenomicFlag = case_when(NonHumanGenomicExperience == "Moderately experienced" | NonHumanGenomicExperience == "Extremely experienced" ~ TRUE,
+ .default = FALSE)
+ )
+```
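+
+An equivalent, more compact sketch (not run here) uses `%in%`, which returns `FALSE` for `NA` responses and so matches the `.default = FALSE` behavior above:
+
+```{r, eval=FALSE}
+experiencedLevels <- c("Moderately experienced", "Extremely experienced")
+resultsTidy %>%
+  mutate(
+    clinicalFlag        = HumanClinicalExperience   %in% experiencedLevels,
+    humanGenomicFlag    = HumanGenomicExperience    %in% experiencedLevels,
+    nonHumanGenomicFlag = NonHumanGenomicExperience %in% experiencedLevels
+  )
+```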
+
+
+
+## AnVIL Demo Attendance, Awareness, and Utilization
+
+The question asked was pretty granular in describing attendance, use, and awareness of AnVIL Demos. We want to simplify each possible answer into two binary versions: aware of/not aware of and utilized/have not utilized.
+
+Question and possible answers
+
+> Have you attended a monthly AnVIL Demo?
+
+Possible answers include
+
+* Yes, multiple
+* Yes, one
+* Not yet, but am registered to
+* No, but aware of
+* No, didn't know of
+
+
+
+AnVIL Demo recoding details
+
+Description of variable definitions and steps
+
+We set the factor levels of `AnVILDemo` to follow the order of the possible answers above. Within the same `mutate`, two nested `case_when`s (each wrapped in `factor()`) collapse those answers into a binary awareness column, `AnVILDemoAwareness` ("Aware of" vs. "Not Aware of"), and a binary utilization column, `AnVILDemoUse` ("Have/will utilize" vs. "Have not utilized").
+
+```{r, message = FALSE}
+resultsTidy %<>%
+ mutate(AnVILDemo = factor(AnVILDemo, levels = c("Yes, multiple", "Yes, one", "Not yet, but am registered to", "No, but aware of", "No, didn't know of")),
+ AnVILDemoAwareness = factor(case_when(
+ AnVILDemo == "Yes, multiple" ~ "Aware of",
+ AnVILDemo == "Yes, one" ~ "Aware of",
+ AnVILDemo == "Not yet, but am registered to" ~ "Aware of",
+ AnVILDemo == "No, but aware of" ~ "Aware of",
+ AnVILDemo == "No, didn't know of" ~ "Not Aware of"
+ ), levels = c("Not Aware of", "Aware of")),
+ AnVILDemoUse = factor(case_when(
+ AnVILDemo == "Yes, multiple" ~ "Have/will utilize",
+ AnVILDemo == "Yes, one" ~ "Have/will utilize",
+ AnVILDemo == "Not yet, but am registered to" ~ "Have/will utilize",
+ AnVILDemo == "No, but aware of" ~ "Have not utilized",
+ AnVILDemo == "No, didn't know of" ~ "Have not utilized"
+ ), levels = c("Have not utilized", "Have/will utilize"))
+)
+
+```
+
+
diff --git a/_site.yml b/_site.yml
deleted file mode 100644
index 7f07604..0000000
--- a/_site.yml
+++ /dev/null
@@ -1,31 +0,0 @@
-name: OTTR Template Website
-output_dir: 'docs'
-navbar:
- title: OTTR Web
- left:
- - text: ""
- href: index.html
- icon: fa-home
- - text: 1. Setup
- href: setup.html
- - text: 2. Hosting
- href: hosting.html
- - text: Editing
- href: editing.html
- - text: 3. Style
- href: style.html
- - text: 4. Git Actions
- href: git_actions.html
- - text: More ottr docs
- href: https://www.ottrproject.org/
-
-
-output:
- html_document:
- theme: cosmo
- lib_dir: site_libs
- self_contained: no
- highlight: textmate
- css: styles.css
- includes:
- in_header: resources/header.html
diff --git a/OTTR_Template_Website.Rproj b/anvilPoll2024.Rproj
similarity index 75%
rename from OTTR_Template_Website.Rproj
rename to anvilPoll2024.Rproj
index 628359e..8e3c2eb 100644
--- a/OTTR_Template_Website.Rproj
+++ b/anvilPoll2024.Rproj
@@ -9,9 +9,5 @@ UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
-RnwWeave: knitr
+RnwWeave: Sweave
LaTeX: pdfLaTeX
-
-AutoAppendNewline: Yes
-
-BuildType: Website
diff --git a/anvilPoll2024MainAnalysis.Rmd b/anvilPoll2024MainAnalysis.Rmd
new file mode 100644
index 0000000..c44a507
--- /dev/null
+++ b/anvilPoll2024MainAnalysis.Rmd
@@ -0,0 +1,1296 @@
+---
+title: "State of the AnVIL 2024"
+subtitle: "Main analysis"
+author: "Kate Isaac, Elizabeth Humphries, & Ava Hoffman"
+date: "`r Sys.Date()`"
+output: html_document
+---
+
+```{r, message=FALSE}
+library(googlesheets4)
+library(tidyverse)
+library(magrittr) #for %<>%
+library(here)
+library(grid) #for Grobs and unit()
+```
+
+# Read in data
+
+Data were read in via a Google Sheet on the AnVIL Team Drive.
+
+Import details
+The google sheet we are reading in is stored in an AnVIL Google Drive folder, `State of the AnVIL 2024`. Its permissions are restricted such that only people with access can open it with the link. When using `gs4_auth()` to authorize my google account before running this code, I needed to change the `scopes` argument; specifically, `scopes = "spreadsheets.readonly"` was necessary.
+
+In this google sheet, each question is a column, and each response to the survey is a row. If the respondent wasn't asked or didn't answer a specific question, there is an NA in the corresponding row/column.
+
+```{r, echo=FALSE, message=FALSE}
+gs4_auth(email = "kathryn.j.isaac@gmail.com", scopes="spreadsheets.readonly")
+resultsRaw <-
+ googlesheets4::read_sheet(
+ "https://docs.google.com/spreadsheets/d/1wDMNC6BD2AaIwh_GOkPTpl1tvAyLwVBQgAvOD2rYrX0/edit?usp=sharing",
+ na = c("NA", "na", ""))
+```
+
+
+
+# Clean data
+
+**Note:** Every code block in this section edits the `resultsTidy` data frame and should be run before plotting within the `# Insights` section below. Subsections are marked according to which Insight they relate to, but cleaning steps like identifying the user type are important for nearly every plot.
+
+## Set Column Names
+
+We set the column names to simplified column names (ones that make it easier to select related columns for various analyses) by reading in a codebook (`data/codebook.txt`).
+
+Simplifying column names details
+
+Description of variable definitions and steps
+
+We have a codebook that is a tab-delimited file with 4 columns, where each row represents a question in the survey. The first column lists the question from the survey (`SurveyColNames`); the second column lists a corresponding simplified column name for that survey question (`SimplifiedColNames`); the third column describes the variable format (`VariableFormat`), e.g., whether it is a double or a character; the fourth column gives a lengthier description of the question (`Description`), e.g., who was asked it, what the possible answers are, etc.
+
+This code block reads in that codebook and specifically selects the `SimplifiedColNames` column. It then renames the column names of the raw results from the google sheet (where each question is a column) with these simplified column names.
+
+
+
+```{r, message=FALSE}
+simplifiedColNames <-
+ read_delim(here("data/codebook.txt"),
+ delim = "\t",
+ col_select = SimplifiedColNames)
+resultsTidy <-
+ resultsRaw %>% `colnames<-`(unlist(simplifiedColNames))
+```
+
+
+## Keep last response if duplicated according to email (if email provided)
+
+We choose to keep the last response because the respondent may have spent more time thinking about how they wanted to respond after their initial response.
+
+Filtering duplicated responses details
+
+Description of variable definitions and steps
+
+* The `table` function tabulates the number of occurrences, and we tell it to ignore literal NAs. Because providing an email was optional, we expect many NA responses. The `table` function, by ignoring NAs, will return the unique emails and the number of times each email was used. We store the tabulated results in the variable `tabulatedEmails`
+* Using the `sum` function, we look to see how many emails/responses are provided more than once. `tabulatedEmails > 1` is returning a vector of TRUEs and FALSEs where TRUE means that there was more than one instance/count of a given email and FALSE means there wasn't. The `sum` function in essence counts the number of TRUEs and if the `sum` is greater than 0, that means there is at least one duplicated email whose count is greater than 1.
+* `duplicatedEmails` reports which emails are duplicated by using the tabulated/table of emails. First it identifies which emails were observed more than once, using the `which` function, and uses the indices returned from that to index the `names` of the tabulated emails, grabbing the specific emails.
+* We want to know which entries from the overall survey responses to remove for each duplicated email. Ideally, we want to remove the responses all at the same time or go backwards removing one at a time, because we don't want to affect downstream indices. The approach here keeps track of all the indices of interest and removes them at the same time.
+ * Therefore, we'll use `lapply` to loop through the duplicated emails (`duplicatedEmails`) and grab the index for survey responses associated with that email address (`which(resultsTidy$Email == duplicatedEmails[x])`).
+  * However, we want to keep the last survey response for each duplicated email. Therefore, we wrap that `which` call in `head(., -1)` so that it grabs all indices except the last one.
+ * Finally, we `unlist` the indices so that there's a single vector associated with indices for any duplicated email responses to be removed `IDXs_to_remove`. And since we want to remove them all at the same time, we subset `resultsTidy`, grabbing every row except those in `IDXs_to_remove`, as denoted by the `-`.
+
+
+
+```{r}
+
+tabulatedEmails <- table(resultsTidy$Email, useNA = "no")
+
+if (sum(tabulatedEmails > 1) > 0) {
+ duplicatedEmails <-
+ names(tabulatedEmails)[which(tabulatedEmails > 1)]
+ IDXs_to_remove <-
+ unlist(lapply(1:length(duplicatedEmails), function(x)
+ head(
+ which(resultsTidy$Email == duplicatedEmails[x]),-1
+ )))
+ resultsTidy <- resultsTidy[-IDXs_to_remove, ]
+}
+
+nrow(resultsTidy)
+```
+
+
+
+## Identify type of user
+
+The first question of the poll asks respondents to describe their current usage of the AnVIL and allows us to categorize respondents as potential or current users of the AnVIL.
+
+Question and possible answers
+
+> How would you describe your current usage of the AnVIL platform?
+
+Possible answers include:
+
+* For completed/long-term projects (e.g., occasional updates/maintenance as needed)
+* For ongoing projects (e.g., consistent project development and/or work)
+* For short-term projects (e.g., short, intense bursts separated by a few months)
+* I do not currently use the AnVIL, but have in the past
+* I have never heard of the AnVIL
+* I have never used the AnVIL, but have heard of it.
+
+The first three possible answers represent current or returning AnVIL users. The last three possible answers represent potential AnVIL users.
+
+
+
+Identifying user type details
+
+Description of variable definitions and steps
+
+We use `case_when` to evaluate the response in the `CurrentUsageDescription` column and assign a corresponding, simplified label of "CurrentUser" or "PotentialUser". In other words, we translate the given response into a user label. Using `case_when` as the nested function inside `mutate` means the translation is saved in a new column, `UserType`.
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(
+ UserType = case_when(
+ CurrentUsageDescription == "For ongoing projects (e.g., consistent project development and/or work)" ~ "CurrentUser",
+ CurrentUsageDescription == "For completed/long-term projects (e.g., occasional updates/maintenance as needed)" ~ "CurrentUser",
+ CurrentUsageDescription == "For short-term projects (e.g., short, intense bursts separated by a few months)" ~ "CurrentUser",
+ CurrentUsageDescription == "I do not currently use the AnVIL, but have in the past" ~ "PotentialUser",
+ CurrentUsageDescription == "I have never used the AnVIL, but have heard of it" ~ "PotentialUser",
+ CurrentUsageDescription == "I have never heard of the AnVIL" ~ "PotentialUser"
+ )
+ ) %>%
+ mutate(UserType = factor(UserType, levels = c("PotentialUser", "CurrentUser")))
+```
+
+
+
+## Institutional Affiliation: Synchronize Institution Names
+
+Users were able to disclose their institutional affiliation using a free text response; therefore, we needed to synchronize institution names (example: Johns Hopkins and Johns Hopkins University refer to the same institution, despite the difference in the free responses) and add simplified affiliation categories ([R1 University, R2 University, Community College, Medical Center or School, International Location, Research Center, NIH, Industry, Unknown] and [Research Intensive, Education Focused, and Industry & Other]). The first level of affiliation categories is notated in an institution-specific codebook (`data/institution_codebook.txt`).
+
+Question and possible answers
+
+> What institution are you affiliated with?
+
+Free response for answers
+
+
+
+Institutional affiliation synchronization details
+
+This synchronization corrects for the various spellings and capitalizations used for the same institution (e.g., Johns Hopkins and Johns Hopkins University refer to the same institution, despite the difference in the free responses).
+
+Description of variable definitions and steps
+
+We use a `recode()` within a `mutate()` to synchronize the institutional affiliations as necessary
+
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(
+ InstitutionalAffiliation =
+ recode(
+ InstitutionalAffiliation,
+ "Broad" = "Broad Institute",
+ "broad institute" = "Broad Institute",
+ "CUNY School of Public Health; Roswell Park Comprehensive Cancer Center" = "City University of New York",
+ "harvard" = "Harvard University",
+ "Harvard Public Health" = "Harvard University",
+ "Johns hopkins" = "Johns Hopkins",
+ "Johns Hopkins University" = "Johns Hopkins",
+ "OHSU" = "Oregon Health & Science University",
+ "OHSU (Knight Center)" = "Oregon Health & Science University",
+ "The Ohio State University" = "Ohio State University",
+ "UCSC" = "University of California Santa Cruz",
+ "univ. ca. santa cruz" = "University of California Santa Cruz",
+ "university of California santa cruz" = "University of California Santa Cruz",
+ "UMASS Chan Medical School" = "UMass Chan Medical School",
+ "Umass Chan Medical School" = "UMass Chan Medical School",
+ "Washington University in St Louis" = "Washington University in St. Louis",
+ "yikongene" = "Yikon Genomics",
+ "v" = "Unknown"
+ )
+ )
+```
+
+Elizabeth Humphries grouped institutional affiliations into a limited set of categories: R1 University, R2 University, Community College, Medical Center or School, International Location, Research Center, NIH, Industry, and Unknown; we notated those groupings/labels within the `institution_codebook.txt` data file. Grouping into limited institutional affiliation categories allows us to consolidate free answers for easier data visualization and identification of trends.
+
+Description of variable definitions and steps
+
+We use `read_delim()` to read in the `institution_codebook.txt` file and select just the `InstitutionalAffiliation` and `InstitutionalType` columns (ignoring the column that specifies how institutions were entered by survey respondents). We then use a `full_join()` by the `InstitutionalAffiliation` column so that the category labels are added as a new `InstitutionalType` column, joining the appropriate values based on the `InstitutionalAffiliation` column.
+
+
+
+```{r, message = FALSE}
+institutionCodeBook <- read_delim(here("data/institution_codebook.txt"), delim="\t", col_select = c(InstitutionalAffiliation, InstitutionalType))
+
+resultsTidy <- full_join(resultsTidy, institutionCodeBook, by = "InstitutionalAffiliation")
+```
+
+Here we further simplify the institutional types into three groups: Research Intensive, Education Focused, and Industry & Other.
+
+This groups R1 University, Research Center, Medical Center or School, and NIH as "Research Intensive"; R2 University & Community College as "Education Focused"; and Industry, International Location, or Unknown as "Industry & Other".
+
+```{r}
+resultsTidy %<>%
+ mutate(FurtherSimplifiedInstitutionalType =
+ case_when(
+ InstitutionalType == "R1 University" ~ "Research Intensive",
+ InstitutionalType == "Research Center" ~ "Research Intensive",
+ InstitutionalType == "Medical Center or School" ~ "Research Intensive",
+ InstitutionalType == "NIH" ~ "Research Intensive",
+ InstitutionalType == "R2 University" ~ "Education Focused",
+ InstitutionalType == "Community College" ~ "Education Focused",
+ InstitutionalType == "Industry" ~ "Industry & Other",
+ InstitutionalType == "International Location" ~ "Industry & Other",
+ InstitutionalType == "Unknown" ~ "Industry & Other"
+ )
+ )
+```
+
+
+
+## Highest degree attained
+
+This question allowed more than one response; however, only one respondent selected two (PhD, MD), which we recoded to MD/PhD. We also simplify the possible responses to group attained and in-progress degrees.
+
+
+Question and possible answers
+
+> What is the highest degree you have attained?
+
+Possible answers include (and multiple choices could be selected and would be comma separated if so)
+
+* High school or equivalent
+* Bachelor's degree
+* Master's degree in progress
+* Master's degree
+* PhD in progress
+* PhD
+* MD in progress
+* MD
+* Other (with free text entry)
+
+
+
+Degree recoding details
+
+Description of variable definitions and steps
+
+Because multiple responses could be selected (comma separated) and a free-text response was possible if "Other" was selected, we need to tidy the data from this question. From visual inspection of the data, the only time multiple responses were selected was for PhD and MD, and no "Other" responses were given. So we'll just recode "PhD, MD" to be "MD/PhD".
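+
+A quick programmatic version of that visual inspection (a hedged sketch, not evaluated here) could count the comma separated multi-selections directly:
+
+```{r, eval = FALSE}
+# How many Degrees responses contain more than one selection, and which combinations occur?
+# as.character() guards against the column still being a list-column at this point.
+multi <- str_detect(as.character(resultsTidy$Degrees), ", ")
+sum(multi, na.rm = TRUE)
+table(as.character(resultsTidy$Degrees)[which(multi)])
+```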
+
+Let's also set the factor levels to follow the general progression of degrees.
+
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(
+ Degrees =
+ factor(recode(Degrees, "PhD, MD" = "MD/PhD"), levels = c("High School or equivalent", "Bachelor's degree", "Master's degree in progress", "Master's degree", "PhD in progress", "PhD", "MD in progress", "MD", "MD/PhD")),
+ FurtherSimplifiedDegrees = recode(Degrees,
+ "Master's degree in progress" = "Master's degree (or in progress)",
+ "Master's degree" = "Master's degree (or in progress)",
+ "PhD in progress" = "PhD (or in progress)",
+ "PhD" = "PhD (or in progress)",
+ "MD/PhD" = "MD (MD, MD/PhD, or in progress)",
+ "MD in progress" = "MD (MD, MD/PhD, or in progress)",
+ "MD" = "MD (MD, MD/PhD, or in progress)"
+ )
+ )
+```
+
+
+
+## Tool Knowledge and Comfort Separate from the AnVIL and on the AnVIL
+
+We want to recode these responses to set the factor levels/progression from "Don't know it" through "Not at all comfortable" all the way to "Extremely comfortable", and to make corresponding integer comfort scores.
+
+Question and possible answers
+
+>How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)?
+
+>How would you rate your knowledge of or comfort with these technologies (on the AnVIL)?
+
+>How would you rate your knowledge of or comfort with these AnVIL data features?
+
+Shared technologies between the first two questions include
+
+* Jupyter Notebooks: `CurrentAnVILTechJupyterNotebooks` & `AllTechJupyterNotebooks`
+* Bioconductor & RStudio: `CurrentAnVILTechRStudio` & `AllTechRStudio` + `AllTechBioconductor`
+* Galaxy: `CurrentAnVILTechGalaxy` & `AllTechGalaxy`
+* WDL Workflows / Workflows (e.g., WDL): `CurrentAnVILTechWDL` & `AllTechWorkflows`
+* Containers: `CurrentAnVILTechContainers` & `AllTechContainers`
+* Unix / Command Line: `CurrentAnVILTechCommandLine` & `AllTechCommandLine`
+
+Technologies only asked separate from the AnVIL
+
+* Python: `AllTechPython`
+* R: `AllTechR`
+
+Technologies/data features only asked with regards to the AnVIL
+
+* Accessing controlled access datasets: `CurrentAnVILTechAccessData`
+* DUOS (Data Use Oversight System): `CurrentAnVILTechDUOS`
+* Terra on AnVIL (Workspaces): `CurrentAnVILTechTerra`
+* TDR (Terra Data Repository): `CurrentAnVILTechTDR`
+
+Possible answers for each of these questions include
+
+* Don't know it (0)
+* Not at all comfortable (1)
+* Slightly comfortable (2)
+* Somewhat comfortable (3)
+* Moderately comfortable (4)
+* Extremely comfortable (5)
+
+The possible "comfort scores" are noted in parentheses next to each possible answer. We'll add these scores as additional columns whose names start with "Score_" but otherwise retain the original column name, keeping the original word-based columns (whose factor levels we'll set to reflect the progression of knowledge/comfort) in case they're still helpful.
+
+Responses are NA if the question wasn't asked of the survey taker (e.g., potential users weren't asked about technologies on the AnVIL).
+
+
+
+Cleaning Comfort level/scores for various technologies and resources details
+
+It's likely that someone who's a program administrator will select "Don't know it" for these... should we remove those responses and see how average scores change?
+
+Description of variable definitions and steps
+
+We select the relevant columns we want to work with (those that start with "CurrentAnVILTech" or "AllTech"). We don't want them to be lists. The non-tidyverse way of doing this would be `unlist(as.character(resultsTidy$AllTechPython))`. We can use the tidyverse `unnest()` function with a `keep_empty = TRUE` argument so that it preserves the NULL values. Notice that in the non-tidyverse way, we had to use `as.character` in order to preserve the NULL values. In the tidyverse way, we still have to apply an `as.character` type conversion before the `unnest()`, otherwise we get an error that double and character values can't be combined.
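+
+As a toy illustration of that type issue (a hedged sketch with made-up values, not evaluated here): unnesting a list-column whose elements mix doubles and characters errors, while `as.character()` flattens everything to character, with NULL entries becoming the string "NULL".
+
+```{r, eval = FALSE}
+# Toy list-column mixing a number, a character answer, and a NULL (question not asked).
+toy <- tibble::tibble(answer = list(3, "Somewhat comfortable", NULL))
+
+# unnest(toy, answer, keep_empty = TRUE)   # errors: can't combine double and character
+
+toy %>%
+  dplyr::mutate(answer = as.character(answer))
+# answer is now a plain character column: "3", "Somewhat comfortable", "NULL"
+```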
+
+After the `unnest()` we use `mutate()` to first work with these as factors (setting the progression we want, from "Don't know it" all the way to "Extremely comfortable") and then to make the replacements specified above, producing an integer score in place of each comfort level. These scores are placed in new columns whose names begin with "Score_" followed by the corresponding original column name.
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(across(starts_with(c(
+ "CurrentAnVILTech", "AllTech"
+ )), as.character)) %>%
+ unnest(starts_with(c("CurrentAnVILTech", "AllTech")), keep_empty = TRUE) %>%
+ mutate(across(starts_with(c(
+ "CurrentAnVILTech", "AllTech"
+ )), ~ parse_factor(
+ .,
+ levels = c(
+ "Don't know it",
+ "Not at all comfortable",
+ "Slightly comfortable",
+ "Somewhat comfortable",
+ "Moderately comfortable",
+ "Extremely comfortable"
+ )
+ ))) %>%
+ mutate(across(
+ starts_with(c("CurrentAnVILTech", "ALLTech")),
+ ~ case_when(
+ . == "Don't know it" ~ 0,
+ . == "Not at all comfortable" ~ 1,
+ . == "Slightly comfortable" ~ 2,
+ . == "Somewhat comfortable" ~ 3,
+ . == "Moderately comfortable" ~ 4,
+ . == "Extremely comfortable" ~ 5
+ )
+ ,
+ .names = "Score_{.col}"
+ ))
+```
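+
+Following up on the note above about program administrators likely selecting "Don't know it", here is a minimal, hedged sketch (not evaluated here) of how the average scores could be compared with and without those responses:
+
+```{r, eval = FALSE}
+# Average each comfort score twice: once including "Don't know it" (score 0),
+# once excluding it, to see how much those responses pull the averages down.
+resultsTidy %>%
+  summarize(across(
+    starts_with("Score_"),
+    list(
+      withDontKnow    = ~ mean(.x, na.rm = TRUE),
+      withoutDontKnow = ~ mean(.x[.x > 0], na.rm = TRUE)
+    )
+  ))
+```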
+
+
+
+## Feature importance: Comparisons of rank of importance of features/resources between Current Users and Potential Users
+
+We want to recode these responses to remove labels and make them integers.
+
+Question and possible answers
+
+>Rank the following features or resources according to their importance for your continued use of the AnVIL
+
+>Rank the following features or resources according to their importance to you as a potential user of the AnVIL?
+
+* Easy billing setup
+* Flat-rate billing rather than use-based
+* Free version with limited compute or storage
+* On demand support and documentation
+* Specific tools or datasets are available/supported
+* Greater adoption of the AnVIL by the scientific community
+
+We're going to compare the assigned ranks for these features between current users and potential users.
+
+### Recode rank values
+
+Description of variable definitions and steps
+
+Columns of interest include
+
+* PotentialRankEasyBillingSetup
+* PotentialRankFlatRateBilling
+* PotentialRankFreeVersion
+* PotentialRankSupportDocs
+* PotentialRankToolsData
+* PotentialRankCommunityAdoption
+* CurrentRankEasyBillingSetup
+* CurrentRankFlatRateBilling
+* CurrentRankFreeVersion
+* CurrentRankSupportDocs
+* CurrentRankToolsData
+* CurrentRankCommunityAdoption
+
+
+
+Cleaning the feature importance ranks details
+
+Description of variable definitions and steps
+
+We can use `starts_with` to select these columns, specifically focusing on the starts with "PotentialRank" and "CurrentRank". When we made simplified names for the columns, these are the only twelve that start like that.
+
+Each survey taker was asked either the 6 CurrentRank questions or the 6 PotentialRank questions, which means we expect NULL values in these columns since no survey taker was asked all twelve.
+
+We want to recode the following values
+
+* Replace "1 (Most important in this list)" with "1"
+* Replace "6 (Least important in this list)" with "6"
+
+Before we can do that, we first need to change the type of the columns in a couple of ways. We don't want them to be lists. The non-tidyverse way of doing this would be `unlist(as.character(resultsTidy$PotentialRankEasyBillingSetup))`. We can use the tidyverse `unnest()` function with a `keep_empty = TRUE` argument so that it preserves the NULL values. Notice that in the non-tidyverse way, we had to use `as.character` to preserve the NULL values. In the tidyverse way, we still have to apply an `as.character` type conversion before the `unnest()`, otherwise we get an error that double and character values can't be combined.
+
+After the `unnest()` we use `recode()` to make the replacements specified above, and then change the type from character to integer so that we can compute average ranks and plot them more easily. Changing the type to integer would warn that NAs are introduced by coercion, so we also add a replacement in the `recode()`, changing "NULL" to `NA_character_`.
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(across(starts_with(c(
+ "PotentialRank", "CurrentRank"
+ )), as.character)) %>%
+ unnest(starts_with(c("PotentialRank", "CurrentRank")), keep_empty = TRUE) %>%
+ mutate(across(
+ starts_with(c("PotentialRank", "CurrentRank")),
+ ~ recode(
+ .x,
+ "1 (Most important in this list)" = "1",
+ "6 (Least important in this list)" = "6",
+ "NULL" = NA_character_
+ )
+ )) %>%
+ mutate(across(starts_with(c(
+ "PotentialRank", "CurrentRank"
+ )), as.integer))
+```
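+
+As an optional sanity check (a hedged sketch, not evaluated here), we could confirm the recoded rank columns are integers between 1 and 6, with NAs only where a respondent wasn't asked that block of questions:
+
+```{r, eval = FALSE}
+resultsTidy %>%
+  summarize(across(
+    starts_with(c("PotentialRank", "CurrentRank")),
+    list(min = ~ min(.x, na.rm = TRUE), max = ~ max(.x, na.rm = TRUE), missing = ~ sum(is.na(.x)))
+  ))
+```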
+
+
+
+## Training Modality Preference
+
+We want to recode these responses to remove labels and make them integers.
+
+Question and possible answers
+
+>Please rank how/where you would prefer to attend AnVIL training workshops.
+
+Possible answers include
+
+* On-site at my institution: `AnVILTrainingWorkshopsOnSite`
+* Virtual: `AnVILTrainingWorkshopsVirtual`
+* Conference (e.g., CSHL, AMIA): `AnVILTrainingWorkshopsConference`
+* AnVIL-specific event: `AnVILTrainingWorkshopsSpecEvent`
+* Other: `AnVILTrainingWorkshopsOther`
+
+The responses are stored in the columns that start with `AnVILTrainingWorkshops`.
+
+
+
+Cleaning the training modality ranks details
+
+Description of variable definitions and steps
+
+We can use `starts_with` to select these columns, specifically focusing on the starts with "AnVILTrainingWorkshops". These are the only 5 that start like that when we made simplified column names.
+
+We want to recode the following values
+
+* Replace "1 (Most preferred in this list)" with "1"
+* Replace "5 (Least preferred in this list)" with "5"
+
+Before we can do that, we first need to change the type of the columns. We don't want them to be lists. We can use the tidyverse `unnest()` function with a `keep_empty = TRUE` argument so that it preserves any NULL values, but first we have to apply an `as.character` type conversion before the `unnest()`, otherwise we get an error that double and character values can't be combined.
+
+After the `unnest()` we use `recode()` to make the replacements specified above, and then change the type from character to integer so that we can compute average ranks and plot them more easily. Changing the type to integer would warn that NAs are introduced by coercion, so we also add a replacement in the `recode()`, changing "NULL" to `NA_character_`.
+
+
+
+```{r}
+
+resultsTidy %<>%
+ mutate(across(starts_with(
+ "AnVILTrainingWorkshops"), as.character)) %>%
+ unnest(starts_with("AnVILTrainingWorkshops"), keep_empty = TRUE) %>%
+ mutate(across(
+ starts_with("AnVILTrainingWorkshops"),
+ ~ recode(
+ .x,
+ "1 (Most preferred in this list)" = "1",
+ "5 (Least preferred in this list)" = "5",
+ "NULL" = NA_character_
+ )
+ )) %>%
+ mutate(across(starts_with("AnVILTrainingWorkshop"), as.integer))
+
+```
+
+
+
+## Simplified experience status for various research categories (clinical, human genomics, non-human genomics)
+
+We want to add three columns that act as flags reporting whether the respondent is
+
+* experienced with clinical research, specifically moderately or extremely experienced in working with human clinical data
+* experienced with human genomics research, specifically moderately or extremely experienced in working with human genomics data
+* experienced with non-human genomics research, specifically moderately or extremely experienced in working with non-human genomics data
+
+We will use this information later to subset responses when considering popular tools or datasets.
+
+Question and possible answers
+
+>How much experience do you have analyzing the following data categories?
+
+The three research categories people are asked about include
+
+* Human Genomic
+* Non-human Genomic
+* Human Clinical
+
+Possible answers include
+
+* Not at all experienced
+* Slightly experienced
+* Somewhat experienced
+* Moderately experienced
+* Extremely experienced
+
+
+
+Setting research category experience flag details
+
+Description of variable definitions and steps
+
+We use a `mutate` together with 3 `case_when`'s.
+
+* If the `HumanClinicalExperience` column response is "Moderately experienced" or "Extremely experienced", we mark that respondent as a human clinical research expert in the `clinicalFlag` column (`TRUE`). Otherwise, we mark a `FALSE` to signify they are not a clinical research expert.
+* If the `HumanGenomicExperience` column response is "Moderately experienced" or "Extremely experienced", we mark that respondent as a human genomic research expert in the `humanGenomicFlag` column (`TRUE`). Otherwise, we again mark a `FALSE` to signify not an expert.
+* If the `NonHumanGenomicExperience` column response is "Moderately experienced" or "Extremely experienced", we mark that respondent as a non-human genomic research expert in the `nonHumanGenomicFlag` column (`TRUE`). Otherwise, we again mark a `FALSE` to signify not an expert.
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(
+ clinicalFlag = case_when(
+ HumanClinicalExperience == "Moderately experienced" | HumanClinicalExperience == "Extremely experienced" ~ TRUE,
+ .default = FALSE
+ ),
+ humanGenomicFlag = case_when(
+ HumanGenomicExperience == "Moderately experienced" | HumanGenomicExperience == "Extremely experienced" ~ TRUE,
+ .default = FALSE
+ ),
+ nonHumanGenomicFlag = case_when(NonHumanGenomicExperience == "Moderately experienced" | NonHumanGenomicExperience == "Extremely experienced" ~ TRUE,
+ .default = FALSE)
+ )
+```
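+
+A quick, hedged check (not evaluated here) of how many respondents carry each experience flag:
+
+```{r, eval = FALSE}
+resultsTidy %>%
+  summarize(
+    clinical = sum(clinicalFlag),
+    humanGenomic = sum(humanGenomicFlag),
+    nonHumanGenomic = sum(nonHumanGenomicFlag)
+  )
+```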
+
+
+
+# Insights
+
+## Identify type of user
+
+**Takeaway:** Of the ```r nrow(resultsTidy)``` responses, ```r nrow(resultsTidy %>% filter(UserType == "CurrentUser"))``` were current users and ```r nrow(resultsTidy %>% filter(UserType == "PotentialUser"))``` were potential users. The majority of current users belonged to the group who use the AnVIL for ongoing projects, while the majority of potential users were split roughly evenly between those who have never used the AnVIL (but have heard of it) and those who previously used the AnVIL but don't use it currently.
+
+**Potential Follow-ups:**
+
+- Look to see if those potential users who previously used to use the AnVIL show similarity in overall trends with the rest of the potential users
+- Directly ask why they no longer use the AnVIL (Elizabeth mentioned the possibility that the AnVIL is sometimes used in courses or workshops and students may not use it after that)
+
+### Prepare and plot the data
+
+Description of variable definitions and steps
+
+First, we group the data by the assigned UserType labels/categories and their related more detailed descriptions. Then we use `summarize` to count the occurrences for each of those categories. We use a `mutate()` to shorten and wrap the detailed descriptions so they fit better on the plot. We then send this data to ggplot with the count on the x-axis and the usage descriptions on the y-axis (ordered by count so the highest count is on top). We fill with the `UserType` description we've assigned. We manually scale the fill to be AnVIL colors and specify we want a stacked bar chart. We then make edits for the theme and labels and finally add a `geom_text` label showing the count next to each bar before we save the plot.
+
+
+
+```{r, message=FALSE, echo=FALSE}
+resultsTidy %>%
+ group_by(UserType, CurrentUsageDescription) %>%
+ summarize(count = n()) %>%
+ mutate(CurrentUsageDescription = case_when(
+ CurrentUsageDescription == "For ongoing projects (e.g., consistent project development and/or work)" ~ "For ongoing projects:\nconsistent project development\nand/or work",
+ CurrentUsageDescription == "For completed/long-term projects (e.g., occasional updates/maintenance as needed)" ~ "For completed/long-term projects:\noccasional updates/maintenance\nas needed",
+ CurrentUsageDescription == "For short-term projects (e.g., short, intense bursts separated by a few months)" ~ "For short-term projects:\nshort, intense bursts\nseparated by a few months",
+ CurrentUsageDescription == "I do not currently use the AnVIL, but have in the past" ~ "I do not current use the AnVIL,\nbut have in the past",
+ CurrentUsageDescription == "I have never used the AnVIL, but have heard of it" ~ "I have never\nused the AnVIL",
+ CurrentUsageDescription == "I have never heard of the AnVIL" ~ "I have never\nheard of the AnVIL"
+ )) %>%
+ ggplot(aes(x = count, y = reorder(CurrentUsageDescription, count), fill = UserType)) +
+ scale_fill_manual(values = c("#E0DD10", "#035C94")) +
+ geom_bar(stat="identity", position ="stack") +
+ theme_classic() +
+ xlab("Count") +
+ ylab("Current Usage Description") +
+ ggtitle("How would you describe your current usage\nof the AnVIL platform?") +
+ geom_text(aes(label = count, group = CurrentUsageDescription),
+ hjust = -0.5, size=2)
+
+ggsave(here("plots/respondent_usagedescription.png"))
+```
+
+
+## Demographics: Institutional Affiliation
+
+**Takeaway:**
+
+### Prepare and plot the data
+
+```{r, message=FALSE, echo = FALSE}
+resultsTidy %>%
+ mutate(FurtherSimplifiedInstitutionalType = factor(FurtherSimplifiedInstitutionalType, levels = c("Industry & Other", "Education Focused", "Research Intensive"))) %>%
+ group_by(UserType, FurtherSimplifiedInstitutionalType) %>% summarize(InstitutionalCount = n()) %>%
+ ggplot(aes(
+ y = FurtherSimplifiedInstitutionalType,
+ x = InstitutionalCount,
+ fill = UserType
+ )) + geom_bar(position = "stack", stat = "identity") +
+ theme_classic() +
+ geom_text(
+ aes(label = after_stat(x), group = FurtherSimplifiedInstitutionalType),
+ stat = 'summary', fun = sum, hjust = -1, size=2
+ ) +
+ ylab("") +
+ xlab("Count") +
+ ggtitle("Institutional Affiliation for All Survey Respondents") +
+ annotation_custom(textGrob("- R1 University \n- Med Campus \n- Research Center\n- NIH ", gp = gpar(fontsize = 8)), xmin = -8.5, xmax = -8.5, ymin = 2.65, ymax = 2.65) +
+ annotation_custom(textGrob("- Industry \n- International Loc\n- Unknown ", gp = gpar(fontsize = 8)), xmin = -8.5, xmax = -8.5, ymin = .7, ymax = .7) +
+ annotation_custom(textGrob("- R2 University \n- Community College", gp=gpar(fontsize=8)),xmin=-8.5,xmax=-8.5,ymin=1.75,ymax=1.75) +
+ coord_cartesian(clip = "off") +
+ scale_fill_manual(values = c("#E0DD10", "#035C94")) +
+ ggtitle("What institution are you affiliated with?")
+ggsave(here("plots/institutionalType_simplified_allResponses_colorUserType.png"))
+```
+
+## Demographics: Highest Degree Attained
+
+**Takeaway:**
+
+### Prepare and plot the data
+
+Description of variable definitions and steps
+
+First we use `group_by()` on `FurtherSimplifiedDegrees` and `UserType` in conjunction with `summarize(n = n())` to add counts for how many of each combination are observed in the data.
+
+Then we send this data to ggplot and make a bar chart with the y-axis representing the degrees (`reorder`ed by the summed count so that degrees with higher counts appear first; otherwise the 2 MDs would be located after the high school and master's-in-progress bars, which have 1 each). The x-axis represents the count, and the fill specifies user type (current or potential AnVIL users). We use a stacked bar chart and include labels at the end of each bar showing the total sum for that degree type.
+
+Used [this Stack Overflow post to label sums above the bars](https://stackoverflow.com/questions/30656846/draw-the-sum-value-above-the-stacked-bar-in-ggplot2)
+
+and used [this Stack Overflow post to remove NA from the legend](https://stackoverflow.com/questions/45493163/ggplot-remove-na-factor-level-in-legend)
+
+The rest of the changes relate to the theme and labels and to making sure that the numerical bar labels aren't cut off at the edge of the plot.
+
+
+
+```{r, message=FALSE, echo=FALSE}
+
+resultsTidy %>%
+ group_by(FurtherSimplifiedDegrees, UserType) %>%
+ summarize(n = n()) %>%
+ ggplot(aes(y = reorder(FurtherSimplifiedDegrees, n, sum),
+ x = n,
+ fill = UserType
+ )) +
+ geom_bar(position = "stack", stat="identity") +
+ geom_text(
+ aes(label = after_stat(x), group = FurtherSimplifiedDegrees),
+ stat = 'summary', fun = sum, hjust = -1, size=2
+ ) +
+ theme_classic() +
+ ylab("Degree") +
+ xlab("Count") +
+ coord_cartesian(clip = "off") +
+ scale_fill_manual(values = c("#E0DD10", "#035C94"), na.translate = F) +
+ ggtitle("What is the highest degree you have attained?")
+
+ggsave(here("plots/degree_furthersimplified_usertype.png"))
+```
+
+## Experience: Genomics and Clinical Research Experience
+
+**Takeaway:**
+
+### Prepare and plot the data
+
+Description of variable definitions and steps for preparing the data
+
+Here we select the columns containing answers for each data category: `HumanGenomicExperience`, `HumanClinicalExperience`, and `NonHumanGenomicExperience`. We also select `UserType` in case we want to split out user type when viewing the data. We use `pivot_longer()` to make a long dataframe that can be grouped and counted: the category/column names go to a new column, `researchType`, and the values in those columns go to a new column, `experienceLevel`. Before we group and count, we set the factor levels on the new `experienceLevel` column to match the progression from not at all experienced to extremely experienced, and we rename the research categories so that the words have spaces and say "Research" instead of "Experience". Then we use `group_by()` and `summarize()` to add counts for each combination of research category, experience level, and `UserType`. These counts are in the new `n` column.
+
+
+
+```{r, message=FALSE, echo=FALSE}
+experienceDf <- resultsTidy %>% select(HumanGenomicExperience, HumanClinicalExperience, NonHumanGenomicExperience, UserType) %>%
+ pivot_longer(c(HumanGenomicExperience, HumanClinicalExperience, NonHumanGenomicExperience), names_to = "researchType", values_to = "experienceLevel") %>%
+ mutate(experienceLevel =
+ factor(experienceLevel, levels = c("Not at all experienced", "Slightly experienced", "Somewhat experienced", "Moderately experienced", "Extremely experienced")),
+ researchType = case_when(researchType == "HumanClinicalExperience" ~ "Human Clinical Research",
+ researchType == "HumanGenomicExperience" ~ "Human Genomic Research",
+ researchType == "NonHumanGenomicExperience" ~ "Non-human\nGenomic Research")) %>%
+ group_by(researchType, experienceLevel, UserType) %>% summarize(n = n())
+```
+
+Description of variable definitions and steps for plotting the bar graph
+
+We didn't observe big differences between current and potential users, so we believe this grouped plot is useful for understanding the community as a whole.
+
+This bar plot has the experience level on the x-axis, the count on the y-axis, and fills the bars according to the experience level (though the fill/color legend is turned off by setting `legend.position` to "none"). We facet by the research category type and label the bars. We use a summary stat with the `sum` function and `after_stat(y)` for the labels, since the data still contains splits (like `UserType`) that we're not visualizing here and the counts need to be summed within each bar.
+
+We adjust various aspects of the theme like turning off the grid and background and rotating the x-tick labels and changing the x- and y-axis labels. We also slightly widen the left axis so that the tick labels aren't cut off.
+
+
+
+```{r, message=FALSE, echo = FALSE}
+ggplot(experienceDf, aes(x=experienceLevel,y=n, fill = experienceLevel)) +
+ facet_grid(~researchType) +
+ geom_bar(stat="identity") +
+ theme_bw() +
+ theme(panel.background = element_blank(), panel.grid = element_blank()) +
+ theme(axis.text.x = element_text(angle = 45, hjust=1)) +
+ geom_text(
+ aes(label = after_stat(y), group = experienceLevel),
+ stat = 'summary', fun = sum, vjust = -0.5, size=2
+) +
+ ylab("Count") + xlab ("Reported Experience Level") +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1,1.05, "cm")) +
+ scale_fill_manual(values = c("#035C94","#035385","#024A77","#024168", "#02395B")) +
+ theme(legend.position = "none")+
+ ggtitle("How much experience do you have analyzing the following data categories?")
+
+
+ggsave(here("plots/researchExperienceLevel_sequentialColor_noUserTypeSplit.png"))
+```
+
+## Experience: Controlled Access Datasets
+
+**Takeaway:**
+
+Question and possible answers
+
+>What large, controlled access datasets do you access or would you be interested in accessing using the AnVIL?
+
+* All of Us*
+* Centers for Common Disease Genomics (CCDG)
+* The Centers for Mendelian Genomics (CMG)
+* Clinical Sequencing Evidence-Generating Research (CSER)
+* Electronic Medical Records and Genomics (eMERGE)
+* Gabriella Miller Kids First (GMKF)
+* Genomics Research to Elucidate the Genetics of Rare Diseases (GREGoR)
+* The Genotype-Tissue Expression Project (GTEx)
+* The Human Pangenome Reference Consortium (HPRC)
+* Population Architecture Using Genomics and Epidemiology (PAGE)
+* Undiagnosed Disease Network (UDN)
+* UK Biobank*
+* None
+* Other (Free Text Response)
+
+Since this is a select-all-that-apply question, we expect multiple responses that are comma separated. The free text responses will likely need to be recoded as well. The responses are in the `AccessWhichControlledData` column.
+
+
+
+### Prepare and plot the data
+
+Description of variable definitions and steps for preparing the data
+
+We make a function, `prep_df_whichData()`, since we'll be using this workflow a few times for different subsets of the data: we want to be able to display the data separately based on the experience status (experienced with clinical research, human genomics research, etc.) of the respondents saying they'd like access to the data.
+
+We want to color the bars based on whether or not the controlled access dataset is currently available on the AnVIL. We create a dataframe `onAnVILDF` to report this, using the [AnVIL dataset catalog/browser](https://explore.anvilproject.org/datasets) to find this information. HPRC and GREGoR don't show up in that resource, but both are available per these sources: [Announcement for HPRC](https://anvilproject.org/news/2021/03/11/hprc-on-anvil), [Access for HPRC](https://anvilproject.org/data/consortia/HPRC), [Access for GREGoR](https://anvilproject.org/data/consortia/GREGoR). Both GMKF and TCGA are hosted on other NCPI platforms and are accessible via the AnVIL because of interoperability (see: https://www.ncpi-acc.org/ and https://ncpi-data.org/platforms). We list these as non-AnVIL hosted since, while accessible, they are not hosted on the AnVIL itself and would be inaccessible without NCPI. Finally, UDN is listed as non-AnVIL hosted as it is in the data submission pipeline and not yet available.
+
+We'll join this AnVIL-hosted-or-not annotation with the survey data at the end.
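+
+A minimal sketch of the expected structure of that availability annotation (the column names match the `col_select` used below, but the example rows and availability labels here are hypothetical, not the actual file contents):
+
+```{r, eval = FALSE}
+tibble::tribble(
+  ~whichControlledAccess, ~AnVIL_Availability,
+  "GTEx",                 "AnVIL hosted",      # hypothetical label
+  "GMKF",                 "Not AnVIL hosted"   # hypothetical label
+)
+```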
+
+Given the input `subset_df`, we expect several answers to be comma separated. Since there are 12 set possible responses (not including "None") and one possible free response answer, we separate the `AccessWhichControlledData` column into 13 columns ("WhichA" through "WhichN"), splitting on ", " (a comma followed by a space; splitting on a bare comma produced duplicates that differed only by a leading space). Alternative approaches could [consider using `str_trim`](https://stringr.tidyverse.org/reference/str_trim.html), as sketched below. We set `fill = "right"`, but this shouldn't really matter; it just suppresses the unnecessary warning that NAs are being filled in when a respondent gave fewer than 13 responses. If there's only one response, it'll go in `WhichA` and the rest are filled with `NA`; if there are two, they'll go in `WhichA` and `WhichB` and the rest are filled with `NA`, and so on.
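+
+A hedged sketch of that `str_trim()` alternative (not evaluated here, and using `separate_rows()` on toy input rather than the 13 fixed columns used below):
+
+```{r, eval = FALSE}
+# Split on a bare comma, then trim the stray leading whitespace afterwards.
+tibble::tibble(AccessWhichControlledData = "All of Us*, UK Biobank*") %>%
+  tidyr::separate_rows(AccessWhichControlledData, sep = ",") %>%
+  dplyr::mutate(AccessWhichControlledData = stringr::str_trim(AccessWhichControlledData))
+```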
+
+We then use `pivot_longer()` to grab these columns we just made, putting the column names in a new column `WhichChoice` and the values in each column into a new column `whichControlledAccess`. We drop all the NAs in this new `whichControlledAccess` column (and there are a lot of them).
+
+Then we group by the new `whichControlledAccess` column and summarize a count for how many there are for each response.
+
+Then we pass this to `mutate()` and `recode()` to simplify the fixed responses to just their acronyms, to remove asterisks (which let the survey respondent know that the dataset wasn't available because of policy restrictions), and to recode the free text responses (details below in "Additional notes on free text response recoding").
+
+We use a `left_join()` to join the cleaned data with a dataframe that specifies whether each dataset is currently available on the AnVIL or not. It's a left join rather than a full join so that it only adds the annotation for datasets that actually appear in the results.
+
+Finally, we return this subset and cleaned dataframe so that it can be plotted.
+
+
+
+ Additional notes on free text response recoding
+
+There were 4 "Other" free response responses
+
+* "Being able to pull other dbGap data as needed."
+ --> We recoded this to be an "Other"
+* "GnomAD and ClinVar"
+ --> GnomAD and ClinVar are not controlled access datasets so we recoded that response to be "None"
+* "Cancer omics datasets"
+ --> We recoded this to be an "Other"
+* "TCGA"
+ --> This response was left as is since there is a controlled access tier.
+
+
+
+```{r, message = FALSE, echo = FALSE}
+onAnVILDF <- read_delim(here("data/controlledAccessData_codebook.txt"), delim = "\t", col_select = c(whichControlledAccess, AnVIL_Availability))
+
+# Given a (possibly filtered) copy of resultsTidy and the AnVIL-availability annotation
+# dataframe, tidy the select-all-that-apply responses and count how often each dataset
+# was chosen. onAnVILDF must be supplied by the caller.
+prep_df_whichData <- function(subset_df, onAnVILDF){
+ subset_df %<>% separate(AccessWhichControlledData, c("WhichA", "WhichB", "WhichC", "WhichD", "WhichE", "WhichF", "WhichG", "WhichH", "WhichI", "WhichJ", "WhichK", "WhichM", "WhichN"), sep = ", ", fill="right") %>%
+ pivot_longer(starts_with("Which"), names_to = "WhichChoice", values_to = "whichControlledAccess") %>%
+ drop_na(whichControlledAccess) %>%
+ group_by(whichControlledAccess) %>% summarize(count = n()) %>%
+ mutate(whichControlledAccess =
+ recode(whichControlledAccess,
+ "All of Us*" = "All of Us",
+ "UK Biobank*" = "UK Biobank",
+ "Centers for Common Disease Genomics (CCDG)" = "CCDG",
+ "The Centers for Mendelian Genomics (CMG)" = "CMG",
+ "Clinical Sequencing Evidence-Generating Research (CSER)" = "CSER",
+ "Electronic Medical Records and Genomics (eMERGE)" = "eMERGE",
+ "Gabriella Miller Kids First (GMKF)" = "GMKF",
+ "Genomics Research to Elucidate the Genetics of Rare Diseases (GREGoR)" = "GREGoR",
+ "The Genotype-Tissue Expression Project (GTEx)" = "GTEx",
+ "The Human Pangenome Reference Consortium (HPRC)" = "HPRC",
+ "Population Architecture Using Genomics and Epidemiology (PAGE)" = "PAGE",
+ "Undiagnosed Disease Network (UDN)" = "UDN",
+ "Being able to pull other dbGap data as needed." = "Other",
+ "Cancer omics datasets" = "Other",
+ "GnomAD and ClinVar" = "None", #not controlled access
+ )
+ ) %>% left_join(onAnVILDF, by="whichControlledAccess")
+
+ return(subset_df)
+}
+
+```
+
+Description of variable definitions and steps for preparing the data, continued
+
+Here we set up 4 data frames for plotting
+
+* The first uses all of the responses and sends them through the `prep_df_whichData()` function to clean the data for plotting to see which controlled access datasets are the most popular.
+* The second filters to grab just the responses from those experienced in clinical research using the `clinicalFlag` column (described earlier in the Clean Data -> Simplified experience status for various research categories (clinical, human genomics, non-human genomics) subsection)
+* The third filters to grab just the responses from those experienced in human genomic research using the `humanGenomicFlag` column (described earlier in the Clean Data -> Simplified experience status for various research categories (clinical, human genomics, non-human genomics) subsection)
+* The fourth filters to grab just the responses from those experienced in non-human genomic research using the `nonHumanGenomicFlag` column (described earlier in the Clean Data -> Simplified experience status for various research categories (clinical, human genomics, non-human genomics) subsection)
+
+
+
+```{r, message=FALSE, echo = FALSE}
+whichDataDf <- resultsTidy %>% prep_df_whichData(onAnVILDF = onAnVILDF)
+
+whichDataClinicalSubset <- resultsTidy %>%
+ filter(clinicalFlag == TRUE) %>%
+ prep_df_whichData(onAnVILDF = onAnVILDF)
+
+whichDataHumanGenomicSubset <- resultsTidy %>%
+ filter(humanGenomicFlag == TRUE) %>%
+ prep_df_whichData(onAnVILDF = onAnVILDF)
+
+whichDataNonHumanGenomicSubset <- resultsTidy %>%
+ filter(nonHumanGenomicFlag == TRUE) %>%
+ prep_df_whichData(onAnVILDF = onAnVILDF)
+
+```
+
+Description of variable definitions and steps for plotting the bar graphs
+
+We also make a function here because the plotting steps are the same for each subset; only the subtitle and the input dataframe change.
+
+This takes the input dataframe and plots a bar plot with the controlled access datasets listed along the x-axis (reordered by count so the most popular is on the left), the count (popularity) on the y-axis, and the fill based on whether the dataset is available on the AnVIL or not.
+
+We change theme elements (removing panel borders, panel background, and panel grid, and rotating the x-axis tick labels), add x- and y-axis labels, and add a title (and a subtitle if specified, which it will be when we're looking at just a subset, like respondents who are experienced with clinical data).
+
+We also add text labels above the bars to say how many times each dataset was marked/requested. Note that we again have to use the `after_stat(y)` / `stat = 'summary'` / `fun = sum` approach because the recoding happened after we counted with `group_by()` and `summarize()`; several original responses may have been recoded to the same label, so the label must sum those counts to be accurate. We use `coord_cartesian(clip = "off")` so these bar text labels aren't cut off, and finally return the plot.
+
+We call this function 4 times
+
+* once for all the data (and don't use a subtitle)
+* next for just those experienced with clinical data (using a subtitle to specify this)
+* next for just those experienced with human genomic data (using a subtitle to specify this)
+* and finally for just those experienced with non-human genomic data (using a subtitle to specify this)
+
+
+
+```{r, message=FALSE, echo=FALSE}
+
+plot_which_data <- function(inputToPlotDF, subtitle = NULL){
+
+ toreturnplot <- ggplot(inputToPlotDF, aes(x = reorder(whichControlledAccess, -count), y = count, fill = AnVIL_Availability)) +
+ geom_bar(stat="identity") +
+ theme_classic() + theme(panel.background = element_blank(), panel.grid = element_blank()) +
+ theme(axis.text.x = element_text(angle=45, hjust=1)) +
+ xlab("Controlled access datasets") + ylab("Count") +
+ ggtitle("What large, controlled access datasets do you access\nor would you be interested in accessing using the AnVIL?", subtitle = subtitle) +
+ geom_text(aes(label = after_stat(y), group = whichControlledAccess),
+ stat = 'summary', fun = sum, vjust = -1, size=2) +
+ coord_cartesian(clip = "off") +
+ scale_fill_manual(values = c("#25445A", "#7EBAC0", "grey")) +
+ theme(legend.position = c(0.8, 0.8))
+
+return(toreturnplot)
+
+}
+
+```
+
+
+```{r, message=FALSE, echo = FALSE}
+everyoneDataPlot <- plot_which_data(whichDataDf)
+
+everyoneDataPlot
+
+ggsave(here("plots/whichcontrolleddata.png"), plot = everyoneDataPlot)
+```
+
+```{r, message=FALSE, echo=FALSE}
+clinicalDataPlot <- plot_which_data(whichDataClinicalSubset, subtitle = "Respondents moderately or extremely experienced with clinical data")
+
+clinicalDataPlot
+
+ggsave(here("plots/whichcontrolleddata_clinical.png"), plot = clinicalDataPlot)
+```
+
+```{r, message=FALSE, echo=FALSE}
+humanGenomicDataPlot <- plot_which_data(whichDataHumanGenomicSubset, subtitle = "Respondents moderately or extremely experienced with human genomic data")
+
+humanGenomicDataPlot
+
+ggsave(here("plots/whichcontrolleddata_humangenomic.png"), plot = humanGenomicDataPlot)
+```
+
+```{r, message=FALSE, echo=FALSE}
+nonHumanGenomicDataPlot <- plot_which_data(whichDataNonHumanGenomicSubset, subtitle = "Respondents moderately or extremely experienced with non-human genomic data")
+
+nonHumanGenomicDataPlot
+
+ggsave(here("plots/whichcontrolleddata_nonhumangenomic.png"), plot = nonHumanGenomicDataPlot)
+```
+
+## Experience: Tool & Resource Knowledge/Comfort level
+
+**Takeaway:**
+
+### Prepare and plot the data
+
+Description of variable definitions and steps for preparing the data
+
+For each user type, we select the relevant "Score_" columns, sum them with `colSums()`, and divide by the number of respondents of that user type to get an average comfort score for each column. We then split each column name on "Tech" to recover whether the tool was asked about on the AnVIL or separate from the AnVIL (`AnVILorNo`) and which tool it was (`Tool`), recoding the tool abbreviations into more readable labels. Finally, we bind the current-user and potential-user dataframes together and set the `UserType` factor levels.
+
+
+```{r, message=FALSE, echo = FALSE}
+toPlotToolKnowledge <- bind_rows(
+ resultsTidy %>%
+ filter(UserType == "CurrentUser") %>%
+ select(starts_with("Score_")) %>%
+ colSums(na.rm = TRUE) %>%
+ as.data.frame() %>% `colnames<-`(c("totalScore")) %>%
+ mutate(nscores = sum(resultsTidy$UserType == "CurrentUser"),
+ avgScore = totalScore / nscores,
+ UserType = "Current Users") %>%
+ mutate(WhereTool = rownames(.)) %>%
+ separate(WhereTool, c("AnVILorNo", "Tool"), sep = "Tech", remove = TRUE) %>%
+ mutate(AnVILorNo =
+ case_when(AnVILorNo == "Score_CurrentAnVIL" ~ "On the AnVIL",
+ AnVILorNo == "Score_All" ~ "Separate from the AnVIL"
+ ),
+ Tool =
+ recode(Tool, "JupyterNotebooks" = "Jupyter Notebooks",
+ "WDL" = "Workflows",
+ "CommandLine" = "Unix / Command Line",
+ "AccessData" = "Access controlled access data",
+ "Terra" = "Terra Workspaces",
+ "BioconductorRStudio" = "Bioconductor & RStudio"
+ )
+ ),
+ resultsTidy %>%
+ filter(UserType == "PotentialUser") %>%
+ select(starts_with("Score_AllTech")) %>%
+ colSums() %>%
+ as.data.frame() %>% `colnames<-`(c("totalScore")) %>%
+ mutate(nscores = sum(resultsTidy$UserType == "PotentialUser"),
+ avgScore = totalScore / nscores,
+ UserType = "Potential Users") %>%
+ mutate(WhereTool = rownames(.)) %>%
+ separate(WhereTool, c("AnVILorNo", "Tool"), sep = "Tech", remove = TRUE) %>%
+ mutate(AnVILorNo =
+ case_when(AnVILorNo == "Score_CurrentAnVIL" ~ "On the AnVIL",
+ AnVILorNo == "Score_All" ~ "Separate from the AnVIL"
+ ),
+ Tool =
+ recode(Tool, "JupyterNotebooks" = "Jupyter Notebooks",
+ "WDL" = "Workflows",
+ "CommandLine" = "Unix / Command Line",
+ "AccessData" = "Access controlled access data",
+ "Terra" = "Terra Workspaces",
+ "BioconductorRStudio" = "Bioconductor & RStudio"
+ )
+ )
+) %>%
+ mutate(UserType = factor(UserType, levels = c("Potential Users", "Current Users")))
+```
+
+
+```{r, message=FALSE, echo = FALSE}
+# Split the combined "Bioconductor & RStudio" rows into separate "Bioconductor" and
+# "RStudio" rows (carrying the same average score) so each tool gets its own line in the plot.
+roi <- toPlotToolKnowledge[which(toPlotToolKnowledge$Tool == "Bioconductor & RStudio"),]
+toPlotToolKnowledge <- rows_append(toPlotToolKnowledge, data.frame(
+ UserType = rep(roi$UserType,2),
+ avgScore = rep(roi$avgScore,2),
+ AnVILorNo = rep(roi$AnVILorNo,2),
+ Tool = c("Bioconductor", "RStudio")
+ )) %>%
+ rows_delete(., data.frame(roi))
+```
+
+
+Description of variable definitions and steps for plotting the dumbbell-like plot
+
+We used [this Stack Overflow response](https://stackoverflow.com/a/72309061) to get the values for `scale_shape_manual()`.
+
+
+
+```{r, message=FALSE, echo = FALSE}
+# Provide a list of AnVIL only Tools
+AnVIL_only <-
+ setdiff(toPlotToolKnowledge[toPlotToolKnowledge$UserType == "Current Users" &
+ toPlotToolKnowledge$AnVILorNo == "On the AnVIL", ]$Tool,
+ toPlotToolKnowledge[toPlotToolKnowledge$UserType == "Potential Users", ]$Tool)
+
+# Order dummy column based only on Potential users
+toPlotToolKnowledge <-
+ toPlotToolKnowledge %>% mutate(ToolOrder = case_when(
+ UserType == "Potential Users" | Tool %in% AnVIL_only ~ avgScore,
+ TRUE ~ 0
+ ))
+
+PlotToolKnowledge_avg_score <-
+ ggplot(toPlotToolKnowledge, aes(y = reorder(Tool, avgScore), x = avgScore)) +
+ geom_point(aes(color = UserType, shape = AnVILorNo))
+
+PlotToolKnowledge_potential_user_score <-
+ ggplot(data = toPlotToolKnowledge) +
+ geom_point(data = toPlotToolKnowledge[toPlotToolKnowledge$UserType == "Potential Users" | toPlotToolKnowledge$Tool %in% AnVIL_only ,],
+ aes(color = UserType, shape = AnVILorNo, y = reorder(Tool, ToolOrder), x = avgScore)) +
+ geom_point(data = toPlotToolKnowledge[toPlotToolKnowledge$UserType == "Current Users",],
+ aes(color = UserType, shape = AnVILorNo, y = Tool, x = avgScore))
+
+PlotToolKnowledge_customization <- function(gg) {
+ return(
+ gg +
+ scale_x_continuous(breaks = 0:5, labels = 0:5, limits = c(0,5)) +
+ ylab("Tool or Data Resource") +
+ xlab("Average Knowledge or Comfort Score") +
+ theme_bw() +
+ theme(panel.background = element_blank(), panel.grid.minor.x = element_blank()) + #facet_wrap(~UserType, nrow=3) +
+ annotation_custom(textGrob("Don't know\nat all", gp=gpar(fontsize=8, fontface = "bold")),xmin=0,xmax=0,ymin=-2,ymax=-2) +
+ annotation_custom(textGrob("Extremely\ncomfortable", gp=gpar(fontsize=8, fontface= "bold")),xmin=5,xmax=5,ymin=-2,ymax=-2) +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1,1.1, "cm")) +
+ ggtitle("How would you rate your knowledge of or\ncomfort with these technologies or data features?") +
+ scale_color_manual(values = c("#E0DD10", "#035C94")) +
+ scale_shape_manual(values = c(4, 16))
+ )
+}
+
+PlotToolKnowledge_customization(PlotToolKnowledge_avg_score)
+ggsave(here("plots/tooldataresourcecomfortscore_singlepanel.png"), w = 2200, h = 1350, units = "px")
+
+PlotToolKnowledge_customization(PlotToolKnowledge_potential_user_score)
+ggsave(here("plots/tooldataresourcecomfortscore_singlepanel_by_potential_users.png"), w = 2200, h = 1350, units = "px")
+```
+
+
+## Awareness: AnVIL Demos
+
+**Takeaway:**
+
+## Awareness: AnVIL Support Forum
+
+**Takeaway:**
+
+## Preferences: Feature importance for current vs potential users
+
+**Takeaway:**
+
+### Prepare and plot the data
+
+Average rank is the total rank (the sum of the given ranks) divided by the number of votes (the number of given ranks).
+
+Description of variable definitions and steps for preparing the data
+
+We make two different dataframes that find the total rank (column name: `totalRank`) and average rank (column name: `avgRank`) for each feature and then row bind (`bind_rows`) these two dataframes together to make `totalRanksdf`. The reason we make the two separately is that one is for potential users (`starts_with("PotentialRank")`) and one is for current users (`starts_with("CurrentRank")`). They have a different number of votes (`nranks`), so it made more sense to work with them separately, following the same steps, and then row bind them together.
+
+The individual steps for each of these dataframes is to
+
+* `select` the relevant columns from `resultsTidy`
+* perform sums with `colSums()`, adding together the ranks in those columns (each column corresponds to a queried feature); we set `na.rm = TRUE` to ignore the NAs (since not every survey respondent was asked each question; e.g., current users weren't asked the potential-user questions)
+* send those sums to a data frame such that the selected column names from the first step are now the row names and the total summed rank is the only column with values in each row corresponding to each queried feature
+* Use a `mutate` to
+  * add a new column `nranks` that holds the number of responses in the survey from potential users (i.e., the number that would have assigned ranks to the PotentialRank questions) or the number from current/returning users (i.e., the number that would have assigned ranks to the CurrentRank questions).
+ * add a new column `avgRank` that divides the `totalRank` by the `nranks`
+
+After these two dataframes are bound together (`bind_rows`), the rest of the steps are for aesthetics in plotting and making sure ggplot knows the `UserType` and the feature of interest, etc.
+
+* We move the rownames to their own column `UsertypeFeature` (with the `mutate(UsertypeFeature = rownames(.))`).
+* We separate the values in that column on the word "Rank", removing the `UsertypeFeature` column we just made and making two new columns (`Usertype` and `Feature`), where `Usertype` is either "Current" or "Potential" and `Feature` holds the abbreviated feature names used in the column names.
+* We then use a `case_when` within a `mutate()` to fill out those features so they're more informative and show the choices survey respondents were given.
+
+
+
+```{r, message=FALSE, echo = FALSE}
+totalRanksdf <-
+ bind_rows(
+ resultsTidy %>%
+ select(starts_with("PotentialRank")) %>%
+ colSums(na.rm = TRUE) %>%
+ as.data.frame() %>% `colnames<-`(c("totalRank")) %>%
+ mutate(nranks = sum(resultsTidy$UserType == "PotentialUser"),
+ avgRank = totalRank / nranks),
+ resultsTidy %>%
+ select(starts_with("CurrentRank")) %>%
+ colSums(na.rm = TRUE) %>%
+ as.data.frame() %>% `colnames<-`(c("totalRank")) %>%
+ mutate(nranks = sum(resultsTidy$UserType == "CurrentUser"),
+ avgRank = totalRank /nranks)
+ ) %>%
+ mutate(UsertypeFeature = rownames(.)) %>%
+ separate(UsertypeFeature, c("Usertype", "Feature"), sep = "Rank", remove = TRUE) %>%
+ mutate(Feature =
+ case_when(Feature == "EasyBillingSetup" ~ "Easy billing setup",
+ Feature == "FlatRateBilling" ~ "Flat-rate billing rather than use-based",
+ Feature == "FreeVersion" ~ "Free version with limited compute or storage",
+ Feature == "SupportDocs" ~ "On demand support and documentation",
+ Feature == "ToolsData" ~ "Specific tools or datasets are available/supported",
+ Feature == "CommunityAdoption" ~ "Greater adoption of the AnVIL by the scientific community"),
+ Usertype = factor(case_when(Usertype == "Potential" ~ "Potential Users",
+ Usertype == "Current" ~ "Current Users"), levels = c("Potential Users", "Current Users"))
+ )
+```
+
+Description of variable definitions and steps for plotting the dumbbell plot
+
+We use the `totalRanksdf` we just made. The x-axis is the `avgRank` values, and the y-axis displays the informative `Feature` values; however, we `reorder` the y-axis so that more important (lower average rank) features are displayed higher in the plot.
+
+`geom_point()` and `geom_line()` are used together to produce the dumbbell look of the plot, and we set the color of the points to correspond to the `Usertype`.
+
+Some theme elements are changed, labels and titles are added, the colors are set to match AnVIL colors, and then we display and save that plot.
+
+The first version of the plot has trimmed limits, so the second version sets limits on the x-axis of 1 to 6 since those were the options survey respondents were given for ranking. It also adds annotations (using [Grobs, explained in this Stack Overflow post answer](https://stackoverflow.com/a/31081162)) to specify which rank was "Most important" and which was "Least important".
+
+Then we've also adjusted the left margin so that the annotation isn't cut off.
+
+We then display and save that version as well.
+
+Finally, we'll reverse the x-axis so that most important is on the right and least important is on the left. We use `scale_x_reverse()` for that. We have to change our group annotations so that they are now on the negative number version of `xmin` and `xmax` that we were using previously. We then display and save that version as well.
+
+
+
+```{r, message=FALSE, echo = FALSE}
+gdumbbell <- ggplot(totalRanksdf, aes(x = avgRank, y = reorder(Feature, -avgRank))) +
+ geom_line() +
+ geom_point(aes(color = Usertype), size = 3) +
+ theme(panel.background = element_blank()) + theme_bw() + theme(legend.position = "bottom") +
+ xlab("Average Rank") +
+ ylab("Feature") +
+ ggtitle("Rank the following features\naccording to their importance to\nyou as a potential user or for\nyour continued use of the AnVIL") +
+ scale_color_manual(values = c("#E0DD10", "#035C94")) +
+ theme(legend.title = element_blank())
+
+gdumbbell <- gdumbbell +
+ scale_x_continuous(breaks = 1:6, labels = 1:6, limits = c(1,6))+
+ annotation_custom(textGrob("Most\nimportant", gp=gpar(fontsize=8, fontface = "bold")),xmin=1,xmax=1,ymin=-0.5,ymax=-0.5) +
+ annotation_custom(textGrob("Least\nimportant", gp=gpar(fontsize=8, fontface= "bold")),xmin=6,xmax=6,ymin=-0.5,ymax=-0.5) +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1,1.1, "cm"))
+
+ggsave(here("plots/dumbbellplot_xlim16_rankfeatures.png"), plot = gdumbbell)
+
+gdumbbell <- gdumbbell +
+ scale_x_reverse(limits = c(6,1), breaks = 6:1, labels = 6:1) +
+ annotation_custom(textGrob("Most\nimportant", gp=gpar(fontsize=8, fontface = "bold")),xmin=-1,xmax=-1,ymin=-0.5,ymax=-0.5) +
+ annotation_custom(textGrob("Least\nimportant", gp=gpar(fontsize=8, fontface= "bold")),xmin=-6,xmax=-6,ymin=-0.5,ymax=-0.5)
+
+gdumbbell
+
+ggsave(here("plots/dumbbellplot_xlim16_revaxis_rankfeatures.png"), plot = gdumbbell)
+
+```
+
+## Preferences: Training Workshop Modality
+
+**Takeaway:**
+
+### Prepare and plot the data
+
+Description of variable definitions and steps for preparing the data
+
+This follows the same approach as the feature-importance ranks above: for each user type we select the columns that start with "AnVILTrainingWorkshops", sum the given ranks with `colSums()`, divide by the number of respondents of that user type to get an average rank, move the column names into a `TrainingType` column, strip the "AnVILTrainingWorkshops" prefix, and recode the remaining abbreviations into readable modality labels before binding the two dataframes together and setting the `UserType` factor levels.
+
+
+```{r, message=FALSE, echo = FALSE}
+toPlotTrainingRanks <- bind_rows(
+ resultsTidy %>%
+ filter(UserType == "CurrentUser") %>%
+ select(starts_with("AnVILTrainingWorkshops")) %>%
+ colSums(na.rm = TRUE) %>%
+ as.data.frame() %>% `colnames<-`(c("totalRank")) %>%
+ mutate(nranks = sum(resultsTidy$UserType == "CurrentUser"),
+ avgRank = totalRank / nranks,
+ UserType = "Current Users") %>%
+ mutate(TrainingType = rownames(.)) %>%
+ mutate(TrainingType = str_replace(TrainingType, "AnVILTrainingWorkshops", "")),
+ resultsTidy %>%
+ filter(UserType == "PotentialUser") %>%
+ select(starts_with("AnVILTrainingWorkshops")) %>%
+ colSums() %>%
+ as.data.frame() %>% `colnames<-`(c("totalRank")) %>%
+ mutate(nranks = sum(resultsTidy$UserType == "PotentialUser"),
+ avgRank = totalRank / nranks,
+ UserType = "Potential Users") %>%
+ mutate(TrainingType = rownames(.)) %>%
+ mutate(TrainingType = str_replace(TrainingType, "AnVILTrainingWorkshops", ""))
+ ) %>% mutate(TrainingType = recode(TrainingType, "SpecEvent" = "AnVIL-specific event", "OnSite" = "On-site at my institution", "Conference" = "Conference (e.g., CSHL, AMIA)")) %>%
+ mutate(UserType = factor(UserType, levels = c("Potential Users", "Current Users")))
+
+```
+
+Description of variable definitions and steps for plotting the dumbbell plot
+
+This follows the same approach as the feature-importance dumbbell plot above: average rank on the x-axis, training modality on the y-axis (reordered so that more preferred modalities appear higher), points colored by user type, annotations marking the most and least preferred ends of the axis, and a reversed-axis version displayed and saved as well.
+
+
+```{r, message=FALSE, echo = FALSE}
+tdumbbell <- ggplot(toPlotTrainingRanks, aes(x = avgRank, y = reorder(TrainingType, -avgRank))) +
+ geom_line() +
+ geom_point(aes(color = UserType), size = 3) +
+ theme(panel.background = element_blank()) + theme_bw() + theme(legend.position = "bottom") +
+ xlab("Average Rank") +
+ ylab("Training Workshop Modality") +
+ ggtitle("Please rank how/where you would prefer to attend\nAnVIL training workshops.") +
+ scale_color_manual(values = c("#E0DD10", "#035C94")) +
+ theme(legend.title=element_blank())
+
+tdumbbell <- tdumbbell +
+ scale_x_continuous(breaks = 5:1, labels = 5:1, limits = c(1,5))+
+ annotation_custom(textGrob("Most\npreferred", gp=gpar(fontsize=8, fontface = "bold")),xmin=1,xmax=1,ymin=-0.5,ymax=-0.5) +
+ annotation_custom(textGrob("Least\npreferred", gp=gpar(fontsize=8, fontface= "bold")),xmin=5,xmax=5,ymin=-0.5,ymax=-0.5) +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1,1.1, "cm"))
+
+ggsave(here("plots/dumbbellplot_xlim15_trainingmodalitypref.png"), plot = tdumbbell)
+
+tdumbbell <- tdumbbell +
+ scale_x_reverse(limits = c(5,1)) +
+ annotation_custom(textGrob("Most\npreferred", gp=gpar(fontsize=8, fontface = "bold")),xmin=-1,xmax=-1,ymin=-0.5,ymax=-0.5) +
+ annotation_custom(textGrob("Least\npreferred", gp=gpar(fontsize=8, fontface= "bold")),xmin=-5,xmax=-5,ymin=-0.5,ymax=-0.5)
+
+tdumbbell
+
+ggsave(here("plots/dumbbellplot_xlim15_revaxis_trainingmodalitypref.png"), plot = tdumbbell)
+```
+
+## Returning User Specific: Likely to recommend?
+
+**Takeaway:**
+
+## Returning User Specific: Number of years of use
+
+**Takeaway:**
+
+## Returning User Specific: Foreseeable Computational Needs
+
+**Takeaway:**
+
+## Session Info and other analysis notes
+
+Session Info
+
+```{r}
+sessionInfo()
+```
+
+
\ No newline at end of file
diff --git a/anvilPoll2024MainAnalysis.html b/anvilPoll2024MainAnalysis.html
new file mode 100644
index 0000000..c2eed33
--- /dev/null
+++ b/anvilPoll2024MainAnalysis.html
@@ -0,0 +1,1695 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+State of the AnVIL 2024
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Data were read in via a Google Sheet on the AnVIL Team Drive.
+
+
+Import details
+
+
The google sheet we are reading in is stored in an AnVIL Google drive
+folder State of the AnVIL 2024. Its permissions are
+restricted such that only people with access can open with the link.
+Using gs4_auth() to authorize my google account before
+running this code, I needed to change the scopes argument,
+specifically scopes=spreadsheets.readonly was
+necessary.
+
In this google sheet, each question is a column, and each response to
+the survey is a row. If the respondant wasn’t asked or didn’t answer a
+specific question, there is an NA in the corresponding row/column.
+
+
+
+
Clean data
+
Note: Every code block in this section edits the
+resultsTidy data frame and should be run before plotting
+within the # Insights section below. Subsections are marked
+according to which Insight they are related to, but cleaning steps like
+identifying the user type are important for most every plot.
+
+
Set Column Names
+
We set the column names to simplified column names (e.g., that help
+us select related columns for various analyses) by reading in a codebook
+(data/codebook.txt).
+
+
+Simplifying column names details
+
+
+
+Description of variable definitions and steps
+
+
We have a codebook that is a tab delmited file and has 4 columns, and
+each row represents a question in the survey. The first column lists
+a/the question from the survey (SurveyColNames); the second
+column lists a corresponding simplified column name for that survey
+question (SimplifedColNames); the third column describes
+the variable format (VariableFormat), e.g, is it a double,
+or a character; the fourth column gives a lengthier description of the
+question (Description), e.g., who was asked it, what
+possible answers are, etc.
+
This code block reads in that codebook and specifically selects the
+SimplifiedColNames column. It then renames the column names
+of the raw results from the google sheet (where each question is a
+column) with these simplified column names.
Keep last response if duplicated according to email (if email
+provided)
+
Choosing to select the last response because the respondent may have
+spent more time thinking about how they wanted to respond after their
+initial response.
+
+
+Filtering duplicated responses details
+
+
+
+Description of variable definitions and steps
+
+
+
The table function tabulates the number of occurrences,
+and we tell it to ignore literal NAs. Because providing an email was
+optional, we expect many NA responses. The table function,
+by ignoring NAs, will return the unique emails and the number of times
+each email was used. We store the tabulated results in the variable
+tabulatedEmails
+
Using the sum function, we look to see how many
+emails/responses are provided more than once.
+tabulatedEmails > 1 is returning a vector of TRUEs and
+FALSEs where TRUE means that there was more than one instance/count of a
+given email and FALSE means there wasn’t. The sum function
+in essence counts the number of TRUEs and if the sum is
+greater than 0, that means there is at least one duplicated email whose
+count is greater than 1.
+
duplicatedEmails reports which emails are duplicated by
+using the tabulated/table of emails. First it identifies which emails
+were observed more than once, using the which function, and
+uses the indices returned from that to index the names of
+the tabulated emails, grabbing the specific emails.
+
We want to know which entries from the overall survey responses to
+remove for each duplicated email. Ideally, we want to remove the
+responses all at the same time or go backwards removing one at a time,
+because we don’t want to affect downstream indices. The approach here
+keeps track of all the indices of interest and removes them at the same
+time.
+
+
Therefore, we’ll use lapply to loop through the
+duplicated emails (duplicatedEmails) and grab the index for
+survey responses associated with that email address
+(which(resultsTidy$Email == duplicatedEmails[x])).
+
However, we want to keep the last survey response for each
+duplicated email. Therefore, we wrap that which call in
+head(which(...), -1) so that it grabs all indices except
+the last one.
+
Finally, we unlist the indices so that there’s a single
+vector, IDXs_to_remove, holding the indices of any duplicated email
+responses to be removed. And since we want to remove them
+all at the same time, we subset resultsTidy, grabbing every
+row except those in IDXs_to_remove, as denoted by the
+-.
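+
+A minimal sketch of these steps, using the variable names described above
+(Email comes from the simplified column names):
+
+# Tabulate provided emails, ignoring missing ones
+tabulatedEmails <- table(resultsTidy$Email, useNA = "no")
+
+if (sum(tabulatedEmails > 1) > 0) {
+  # Emails that appear more than once
+  duplicatedEmails <- names(tabulatedEmails)[which(tabulatedEmails > 1)]
+
+  # For each duplicated email, keep the last response: collect all but the
+  # final index for removal
+  IDXs_to_remove <- unlist(lapply(seq_along(duplicatedEmails), function(x) {
+    head(which(resultsTidy$Email == duplicatedEmails[x]), -1)
+  }))
+
+  # Remove all flagged rows at the same time
+  resultsTidy <- resultsTidy[-IDXs_to_remove, ]
+}
+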
The first question of the poll asks respondents to describe their
+current usage of the AnVIL and allows us to categorize respondents as
+potential or current users of the AnVIL.
+
+
+Question and possible answers
+
+
+
How would you describe your current usage of the AnVIL platform?
+
+
Possible answers include:
+
+
For completed/long-term projects (e.g., occasional
+updates/maintenance as needed)
+
For ongoing projects (e.g., consistent project development and/or
+work)
+
For short-term projects (e.g., short, intense bursts separated by a
+few months)
+
I do not currently use the AnVIL, but have in the past
+
I have never heard of the AnVIL
+
I have never used the AnVIL, but have heard of it.
+
+
The first three possible answers represent current or returning AnVIL
+users. The last three possible answers represent potential AnVIL
+users.
+
+
+
+Identifying user type details
+
+
+
+Description of variable definitions and steps
+
+
We use case_when to evaluate the response in the
+CurrentUsageDescription column and assign a corresponding,
+simplified label of “CurrentUser” or “PotentialUser”. In other words, we
+translate the given response to a user label. Nesting the
+case_when inside the
+mutate function means that the translation is then saved
+in a new column, UserType.
+
+
resultsTidy %<>%
+ mutate(
+ UserType = case_when(
+ CurrentUsageDescription == "For ongoing projects (e.g., consistent project development and/or work)" ~ "CurrentUser",
+ CurrentUsageDescription == "For completed/long-term projects (e.g., occasional updates/maintenance as needed)" ~ "CurrentUser",
+ CurrentUsageDescription == "For short-term projects (e.g., short, intense bursts separated by a few months)" ~ "CurrentUser",
+ CurrentUsageDescription == "I do not currently use the AnVIL, but have in the past" ~ "PotentialUser",
+ CurrentUsageDescription == "I have never used the AnVIL, but have heard of it" ~ "PotentialUser",
+ CurrentUsageDescription == "I have never heard of the AnVIL" ~ "PotentialUser"
+ )
+ ) %>%
+ mutate(UserType = factor(UserType, levels = c("PotentialUser", "CurrentUser")))
Users were able to disclose their institutional affiliation using a
+free text response; therefore, we needed to synchronize institution names
+(example: Johns Hopkins and Johns Hopkins University refer to the same
+institution, despite the difference in the free responses) and add
+simplified affiliation categories ([R1 University, R2 University,
+Community College, Medical Center or School, International Location,
+Research Center, NIH, Industry, Unknown] and [Research Intensive,
+Education Focused, and Industry & Other]). The first level of
+affiliation categories is notated in an institution-specific codebook
+(data/institution_codebook.txt).
This synchronization corrects for the various spellings and
+capitalizations used for the same institution (e.g., Johns Hopkins and
+Johns Hopkins University refer to the same institution, despite the
+difference in the free responses).
+
+
+Description of variable definitions and steps
+
+
We use a recode() within a mutate() to
+synchronize the institutional affiliations as necessary.
+
+
resultsTidy %<>%
+ mutate(
+ InstitutionalAffiliation =
+ recode(
+ InstitutionalAffiliation,
+ "Broad" = "Broad Institute",
+ "broad institute" = "Broad Institute",
+ "CUNY School of Public Health; Roswell Park Comprehensive Cancer Center" = "City University of New York",
+ "harvard" = "Harvard University",
+ "Harvard Public Health" = "Harvard University",
+ "Johns hopkins" = "Johns Hopkins",
+ "Johns Hopkins University" = "Johns Hopkins",
+ "OHSU" = "Oregon Health & Science University",
+ "OHSU (Knight Center)" = "Oregon Health & Science University",
+ "The Ohio State University" = "Ohio State University",
+ "UCSC" = "University of California Santa Cruz",
+ "univ. ca. santa cruz" = "University of California Santa Cruz",
+ "university of California santa cruz" = "University of California Santa Cruz",
+ "UMASS Chan Medical School" = "UMass Chan Medical School",
+ "Umass Chan Medical School" = "UMass Chan Medical School",
+ "Washington University in St Louis" = "Washington University in St. Louis",
+ "yikongene" = "Yikon Genomics",
+ "v" = "Unknown"
+ )
+ )
+
Elizabeth Humphries grouped institutional affiliations into a limited
+set of categories: R1 University, R2 University, Community College,
+Medical Center or School, International Location, Research Center, NIH,
+Industry, and Unknown. We notated those groupings/labels within the
+institution_codebook.txt data file. Grouping into limited
+institutional affiliation categories allows us to consolidate free
+answers for easier data visualization and identification of trends.
+
+
+Description of variable definitions and steps
+
+
We use read_delim() to read in the
+institution_codebook file and select just the
+InstitutionalAffiliation and InstitutionalType
+columns (ignoring the column that specifies how institutions were
+entered by survey respondents). We then use a full_join() by the
+InstitutionalAffiliation column so that the category labels are
+added as a new InstitutionalType column, with the appropriate values
+joined according to the
+InstitutionalAffiliation column.
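+
+A minimal sketch of this join, assuming the institution codebook columns
+described above:
+
+library(tidyverse)
+
+institutionCodebook <- read_delim("data/institution_codebook.txt", delim = "\t") %>%
+  select(InstitutionalAffiliation, InstitutionalType)
+
+# Add the InstitutionalType category labels to the survey results
+resultsTidy <- resultsTidy %>%
+  full_join(institutionCodebook, by = "InstitutionalAffiliation")
+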
Here we further simplify Institutional Affiliations to focus on
+Research Intensive, Education Focused, and Industry & Other.
+
This groups R1 University, Research Center, Medical Center or School,
+and NIH as “Research Intensive”; R2 University & Community College
+as “Education Focused”; and Industry, International Location, or Unknown
+as “Industry & Other”.
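+
+A minimal sketch of that grouping (FurtherSimplifiedInstitutionalType is a
+hypothetical column name; the tidyverse is assumed loaded as in the rest of
+the analysis):
+
+resultsTidy <- resultsTidy %>%
+  mutate(
+    FurtherSimplifiedInstitutionalType = case_when(
+      InstitutionalType %in% c("R1 University", "Research Center",
+                               "Medical Center or School", "NIH") ~ "Research Intensive",
+      InstitutionalType %in% c("R2 University", "Community College") ~ "Education Focused",
+      InstitutionalType %in% c("Industry", "International Location",
+                               "Unknown") ~ "Industry & Other"
+    )
+  )
+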
This question allowed more than one response; however, only one
+response selected two (PhD, MD), which we recoded to be MD/PhD. We
+simplify the possible responses to group attained or in-progress
+degrees.
+
+
+Question and possible answers
+
+
+
What is the highest degree you have attained?
+
+
Possible answers include (and multiple choices could be selected and
+would be comma separated if so)
+
+
High school or equivalent
+
Bachelor’s degree
+
Master’s degree in progress
+
Master’s degree
+
PhD in progress
+
PhD
+
MD in progress
+
MD
+
Other (with free text entry)
+
+
+
+
+Degree recoding details
+
+
+
+Description of variable definitions and steps
+
+
Because multiple responses could be selected (and would be comma
+separated) and because a free text response was possible if Other was
+selected, we need to tidy the data from this question. From visual
+inspection of the data, I see that the only time multiple responses were
+selected was for MD/PhD. No “Other” responses were selected. So we’ll just recode
+“PhD, MD” to be “MD/PhD”.
+
Let’s also set the factor levels to follow the general progression of
+degrees.
+
+
resultsTidy %<>%
+ mutate(
+ Degrees =
+ factor(recode(Degrees, "PhD, MD" = "MD/PhD"), levels = c("High School or equivalent", "Bachelor's degree", "Master's degree in progress", "Master's degree", "PhD in progress", "PhD", "MD in progress", "MD", "MD/PhD")),
+ FurtherSimplifiedDegrees = recode(Degrees,
+ "Master's degree in progress" = "Master's degree (or in progress)",
+ "Master's degree" = "Master's degree (or in progress)",
+ "PhD in progress" = "PhD (or in progress)",
+ "PhD" = "PhD (or in progress)",
+ "MD/PhD" = "MD (MD, MD/PhD, or in progress)",
+ "MD in progress" = "MD (MD, MD/PhD, or in progress)",
+ "MD" = "MD (MD, MD/PhD, or in progress)"
+ )
+ )
+
+
+
+
Tool Knowledge and Comfort Separate from the AnVIL and on the
+AnVIL
+
We want to recode these responses to set the factor level/progression
+from Don’t know it, not at all comfortable, all the way to extremely
+comfortable and make corresponding integer comfort scores.
+
+
+Question and possible answers
+
+
+
How would you rate your knowledge of or comfort with these
+technologies (separate from the AnVIL)? How would you rate your
+knowledge of or comfort with these technologies (on the AnVIL)? How
+would you rate your knowledge of or comfort with these AnVIL data
+features? Shared technologies between these questions include
DUOS (Data Use Oversight System):
+CurrentAnVILTechDUOS
+
Terra on AnVIL (Workspaces): CurrentAnVILTechTerra
+
TDR (Terra Data Repository): CurrentAnVILTechTDR
+
+
Possible answers for each of these questions include
+
+
Don’t know it (0)
+
Not at all comfortable (1)
+
Slightly comfortable (2)
+
Somewhat comfortable (3)
+
Moderately comfortable (4)
+
Extremely comfortable (5)
+
+
Possible “comfort scores” are noted in parentheses next to each
+possible answer. We’ll add these as additional columns that start
+with the prefix “Score_” but otherwise retain the column name, in case
+it’s helpful to still have the word responses (whose factor levels we’ll set to
+reflect the progression of knowledge/comfort).
+
Responses are NA if the question wasn’t asked to the survey taker
+(e.g., they were a potential user and weren’t asked about technologies
+with regards to the AnVIL)
+
+
+
+Cleaning Comfort level/scores for various technologies and resources
+details
+
+
It’s likely that someone who’s a program administrator will select
+“Don’t know it” for these. Should we remove them and see how average scores
+change?
+
+
+Description of variable definitions and steps
+
+
We select the relevant columns (those that start with
+“CurrentAnVILTech” or “AllTech”) we want to work with. We don’t want
+them to be lists. The non-tidyverse way of doing this would be
+unlist(as.character(resultsTidy$PotentialRankEasyBillingSetup)).
+We can use the unnest tidyverse function with a
+keep_empty = TRUE argument so that it preserves the NULL
+values. Notice in the non-tidyverse way, we had to use
+as.character in order to preserve the null values. In the
+tidyverse way, we still have to use an as.character type change before
+the unnest, otherwise, we get an error that double and
+character values can’t be combined.
+
After the unnest we can use the mutate
+function to first work with these as factors (to set the progression we
+want from don’t know it all the way to extremely comfortable) and then
+to make the replacements specified above for an integer score in place
+of the comfort level, placing these scores in new columns whose names
+begin with “Score_” followed by
+the corresponding original column name.
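+
+A minimal sketch for a single comfort column (AllTechR is one of these
+columns; the real code applies this across every column starting with
+“CurrentAnVILTech” or “AllTech”; the tidyverse is assumed loaded):
+
+comfortLevels <- c(
+  "Don't know it", "Not at all comfortable", "Slightly comfortable",
+  "Somewhat comfortable", "Moderately comfortable", "Extremely comfortable"
+)
+
+resultsTidy <- resultsTidy %>%
+  mutate(
+    # set the factor progression from "Don't know it" up to "Extremely comfortable"
+    AllTechR = factor(AllTechR, levels = comfortLevels),
+    # integer comfort score: 0 = "Don't know it" ... 5 = "Extremely comfortable"
+    Score_AllTechR = as.integer(AllTechR) - 1L
+  )
+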
Feature importance: Comparisons of rank of importance of
+features/resources between Current Users and Potential Users
+
We want to recode these responses to remove labels and make them
+integers.
+
+
+Question and possible answers
+
+
+
Rank the following features or resources according to their
+importance for your continued use of the AnVIL
+
+
+
Rank the following features or resources according to their
+importance to you as a potential user of the AnVIL?
+
+
+
Easy billing setup
+
Flat-rate billing rather than use-based
+
Free version with limited compute or storage
+
On demand support and documentation
+
Specific tools or datasets are available/supported
+
Greater adoption of the AnVIL by the scientific community
+
+
We’re going to compare the assigned ranks for these
+features between current users and potential users.
+
+
Recode rank values
+
+
+Description of variable definitions and steps
+
+
Columns of interest include
+
+
PotentialRankEasyBillingSetup
+
PotentialRankFlatRateBilling
+
PotentialRankFreeVersion
+
PotentialRankSupportDocs
+
PotentialRankToolsData
+
PotentialRankCommunityAdoption
+
CurrentRankEasyBillingSetup
+
CurrentRankFlatRateBilling
+
CurrentRankFreeVersion
+
CurrentRankSupportDocs
+
CurrentRankToolsData
+
CurrentRankCommunityAdoption
+
+
+
+
+Cleaning the feature importance ranks details
+
+
+
+Description of variable definitions and steps
+
+
We can use starts_with to select these columns,
+specifically focusing on the starts with “PotentialRank” and
+“CurrentRank”. When we made simplified names for the columns, these are
+the only twelve that start like that.
+
Each survey taker was asked either the 6 CurrentRank or the 6
+PotentialRank questions, which means that we expect NULL values in these columns
+since not every survey taker will have answered all of these
+questions.
+
We want to recode the following values
+
+
Replace 1 (Most important in this list) with 1
+
Replace 6 (Least important in this list) with 6
+
+
Before we can do that, we first need to change the type of the
+columns in several ways. We don’t want them to be lists. The
+non-tidyverse way of doing this would be
+unlist(as.character(resultsTidy$PotentialRankEasyBillingSetup)).
+We can use the unnest tidyverse function with a
+keep_empty = TRUE argument so that it preserves the NULL
+values. Notice in the non-tidyverse way, we had to use
+as.character in order to preserve the null values. In the
+tidyverse way, we still have to use an as.character type change before
+the unnest, otherwise, we get an error that double and
+character values can’t be combined.
+
After the unnest we can use the recode
+function to make the replacements specified above. And then we go ahead
+and change the type from character to integer so that we can compute
+average ranks & plot them more easily. There would otherwise be a warning that
+NAs are introduced by coercion when we change the type to integer, so we
+add a replacement in the recode, changing “NULL” to
+NA_character_.
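+
+A minimal sketch for a single rank column (the real code applies the same
+steps across all of the PotentialRank*/CurrentRank* columns; the tidyverse
+is assumed loaded):
+
+resultsTidy <- resultsTidy %>%
+  # make each list element a character so doubles and characters can mix
+  mutate(PotentialRankEasyBillingSetup = map(PotentialRankEasyBillingSetup, as.character)) %>%
+  # flatten the list-column; keep_empty = TRUE turns unanswered entries into NA
+  unnest(PotentialRankEasyBillingSetup, keep_empty = TRUE) %>%
+  mutate(
+    PotentialRankEasyBillingSetup = recode(
+      PotentialRankEasyBillingSetup,
+      "1 (Most important in this list)" = "1",
+      "6 (Least important in this list)" = "6"
+    ),
+    # integer type makes computing average ranks and plotting easier
+    PotentialRankEasyBillingSetup = as.integer(PotentialRankEasyBillingSetup)
+  )
+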
The responses are stored in the columns that start with
+AnVILTrainingWorkshops.
+
+
+
+Cleaning the training modality ranks details
+
+
+
+Description of variable definitions and steps
+
+
We can use starts_with to select these columns,
+specifically focusing on the starts with “AnVILTrainingWorkshops”. These
+are the only 5 that start like that when we made simplified column
+names.
+
We want to recode the following values
+
+
Replace 1 (Most preferred in this list) with 1
+
Replace 5 (Least preferred in this list) with 5
+
+
Before we can do that, we first need to change the type of the
+columns in several ways. We don’t want them to be lists. We can use the
+unnest tidyverse function with a
+keep_empty = TRUE argument so that it preserves any NULL
+values, but first we have to use an as.character type
+change before the unnest, otherwise, we get an error that
+double and character values can’t be combined.
+
After the unnest we can use the recode
+function to make the replacements specified above. And then we go ahead
+and change the type from character to integer so that we can compute
+average ranks & plot them more easily. There would otherwise be a warning that
+NAs are introduced by coercion when we change the type to integer, so we
+add a replacement in the recode, changing “NULL” to
+NA_character_.
Simplified experience status for various research categories
+(clinical, human genomics, non-human genomics)
+
We want to add three columns that act as flags reporting whether the
+respondent is
+
+
experienced with clinical research, specifically either moderately
+or extremely experienced in working with human clinical data
+
experienced with human genomics research, specifically is moderately
+or extremely experienced in working with human genomics data
+
experienced with non-human genomics research, specifically is
+moderately or extremely experienced in working with non-human genomics
+data
+
+
We will use this information later to subset responses when
+considering popular tools or datasets.
+
+
+Question and possible answers
+
+
+
How much experience do you have analyzing the following data
+categories?
+
+
The three research categories people are asked about include
+
+
Human Genomic
+
Non-human Genomic
+
Human Clinical
+
+
Possible answers include
+
+
Not at all experienced
+
Slightly experienced
+
Somewhat experienced
+
Moderately experienced
+
Extremely experienced.
+
+
+
+
+Setting research category experience flag details
+
+
+
+Description of variable definitions and steps
+
+
We use a mutate together with 3
+case_when’s, as sketched after this list.
+
+
If the HumanClinicalExperience column response is
+“Moderately experienced” or “Extremely experienced”, we mark that
+respondent as a human clinical research expert in the
+clinicalFlag column (TRUE). Otherwise, we mark
+a FALSE to signify they are not a clinical research
+expert.
+
If the HumanGenomicExperience column response is
+“Moderately experienced” or “Extremely experienced”, we mark that
+respondent as a human genomic research expert in the
+humanGenomicFlag column (TRUE). Otherwise, we
+again mark a FALSE to signify not an expert.
+
If the NonHumanGenomicExperience column response is
+“Moderately experienced” or “Extremely experienced”, we mark that
+respondent as a non-human genomic research expert in the
+nonHumanGenomicFlag column (TRUE). Otherwise,
+we again mark a FALSE to signify not an expert.
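+
+A minimal sketch of these flags (column names follow the description above;
+the tidyverse is assumed loaded):
+
+resultsTidy <- resultsTidy %>%
+  mutate(
+    clinicalFlag = case_when(
+      HumanClinicalExperience %in% c("Moderately experienced", "Extremely experienced") ~ TRUE,
+      TRUE ~ FALSE
+    ),
+    humanGenomicFlag = case_when(
+      HumanGenomicExperience %in% c("Moderately experienced", "Extremely experienced") ~ TRUE,
+      TRUE ~ FALSE
+    ),
+    nonHumanGenomicFlag = case_when(
+      NonHumanGenomicExperience %in% c("Moderately experienced", "Extremely experienced") ~ TRUE,
+      TRUE ~ FALSE
+    )
+  )
+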
Takeaway: Of the 50 responses,
+22 were current users and 28 were potential
+users. The majority of current users belonged to the group who use the
+AnVIL for ongoing projects while the majority of potential users were
+evenly split between those who have never used the AnVIL (but have heard
+of it) and those who used to previously use the AnVIL, but don’t
+currently.
+
Potential Follow-ups:
+
+
Look to see if those potential users who previously used to use the
+AnVIL show similarity in overall trends with the rest of the potential
+users
+
Directly ask why they no longer use the AnVIL (Elizabeth mentioned
+the possibility that the AnVIL is sometimes used in courses or workshops
+and students may not use it after that)
+
+
+
Prepare and plot the data
+
+
+Description of variable definitions and steps
+
+
First, we group the data by the assigned UserType labels/categories
+and their related more detailed descriptions. Then we use
+summarize to count the occurrences for each of those
+categories. We use a mutate statement to better fit the detailed
+descriptions on the plot. We then send this data to ggplot with the
+count on the x-axis, and the usage descriptions on the y-axis (ordered
+by count so the highest count is on top). We fill with the user type
+label we’ve assigned. We manually scale the fill to be AnVIL
+colors and specify we want this to be a stacked bar chart. We then make
+edits for the theme and labels and finally add a geom_text label for the
+count next to the bars before we save the plot.
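+
+A hedged sketch of the described plot (usageCounts and the fill colors are
+placeholders, not the analysis’ actual names or AnVIL hex colors; the
+tidyverse/ggplot2 is assumed loaded):
+
+usageCounts <- resultsTidy %>%
+  group_by(UserType, CurrentUsageDescription) %>%
+  summarize(n = n(), .groups = "drop")
+
+ggplot(usageCounts,
+       aes(x = n, y = reorder(CurrentUsageDescription, n), fill = UserType)) +
+  geom_col(position = "stack") +
+  geom_text(aes(label = n), hjust = -0.3) +
+  scale_fill_manual(values = c("PotentialUser" = "gray60",
+                               "CurrentUser" = "steelblue")) + # placeholder colors
+  labs(x = "Count", y = NULL, fill = "User type") +
+  theme_minimal()
+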
+
+
+
+
+
+
Demographics: Institutional Affiliation
+
Takeaway:
+
+
Prepare and plot the data
+
+
+
+
+
Demographics: Highest Degree Attained
+
Takeaway:
+
+
Prepare and plot the data
+
+
+Description of variable definitions and steps
+
+
First we use group_by() on Degrees
+and UserType in conjunction with
+summarize(n = n()) to add counts for how many of each combo
+are observed in the data (see the sketch below).
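+
+A minimal sketch of that grouping step (degreeCounts is a hypothetical name;
+the tidyverse is assumed loaded):
+
+degreeCounts <- resultsTidy %>%
+  group_by(Degrees, UserType) %>%
+  summarize(n = n(), .groups = "drop")
+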
+
Then we send this data to ggplot and make a bar chart with the x-axis
+representing the degrees, reordered by the summed count
+so that higher counts come first; otherwise the 2
+MDs would be located after the high school and master’s in progress bars (1
+each). The y-axis represents the count, and the fill is used to specify
+user type (current or potential AnVIL users). We use a stacked bar chart
+and include labels above each bar of the total sum for that degree
+type.
The rest of the changes are related to theme and labels and making
+sure that the numerical bar labels aren’t cut off on the top.
+
+
+
+
+
+
Experience: Genomics and Clinical Research Experience
+
Takeaway:
+
+
Prepare and plot the data
+
+
+Description of variable definitions and steps for preparing the data
+
+
Here we select the columns containing answers for each data category:
+HumanGenomicExperience,
+HumanClinicalExperience, and
+NonHumanGenomicExperience. We also select
+UserType in case we want to split user type out at all in
+viewing the data. We use a pivot_longer to make a long
+dataframe that can be grouped and the groups counted. The category/column
+names go to a new column, researchType, and the values in
+those columns go to a new column, experienceLevel. Before we
+use group by and count, we set the factor level on the new
+experienceLevel column to match the progression from not at
+all experienced to extremely experienced, and we rename the research
+categories so that the words have spaces, and we say research instead of
+experience. Then we use group_by and summarize
+to add counts for each combination of research category, experience
+level, and usertype. These counts are in the new n
+column.
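+
+A minimal sketch of this reshaping (experienceCounts and the exact relabeling
+are assumptions; the tidyverse is assumed loaded):
+
+experienceLevels <- c("Not at all experienced", "Slightly experienced",
+                      "Somewhat experienced", "Moderately experienced",
+                      "Extremely experienced")
+
+experienceCounts <- resultsTidy %>%
+  select(UserType, HumanGenomicExperience, HumanClinicalExperience,
+         NonHumanGenomicExperience) %>%
+  pivot_longer(cols = -UserType,
+               names_to = "researchType",
+               values_to = "experienceLevel") %>%
+  mutate(
+    experienceLevel = factor(experienceLevel, levels = experienceLevels),
+    researchType = recode(researchType,
+                          "HumanGenomicExperience" = "Human Genomic Research",
+                          "HumanClinicalExperience" = "Human Clinical Research",
+                          "NonHumanGenomicExperience" = "Non-human Genomic Research")
+  ) %>%
+  group_by(researchType, experienceLevel, UserType) %>%
+  summarize(n = n(), .groups = "drop")
+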
+
+
+
+Description of variable definitions and steps for plotting the bar graph
+
+
We didn’t observe big differences between current and potential
+users, so we believe this grouped plot is useful for understanding the
+community as a whole.
+
This bar plot has the experience level on the x-axis, the count on
+the y-axis, and fills the bars according to the experience level (though
+the fill/color legend is turned off by setting legend.position to none).
+We facet by the research category type and label the bars. We use a
+summary stat with the sum function and after_stat(y) for the label since the
+data has splits like user type that we’re not visualizing here.
+
We adjust various aspects of the theme like turning off the grid and
+background and rotating the x-tick labels and changing the x- and y-axis
+labels. We also slightly widen the left axis so that the tick labels
+aren’t cut off.
+
+
+
+
+
+
Experience: Controlled Access Datasets
+
Takeaway:
+
+
+Question and possible answers
+
+
+
What large, controlled access datasets do you access or would you be
+interested in accessing using the AnVIL?
+
+
+
All of Us*
+
Centers for Common Disease Genomics (CCDG)
+
The Centers for Mendelian Genomics (CMG)
+
Clinical Sequencing Evidence-Generating Research (CSER)
+
Electronic Medical Records and Genomics (eMERGE)
+
Gabriella Miller Kids First (GMKF)
+
Genomics Research to Elucidate the Genetics of Rare Diseases
+(GREGoR)
+
The Genotype-Tissue Expression Project (GTEx)
+
The Human Pangenome Reference Consortium (HPRC)
+
Population Architecture Using Genomics and Epidemiology (PAGE)
+
Undiagnosed Disease Network (UDN)
+
UK Biobank*
+
None
+
Other (Free Text Response)
+
+
Since this is a select-all-that-apply question, we expect that there
+will be multiple responses that are comma separated. The free text
+responses will likely need to be recoded as well. The responses are in the
+AccessWhichControlledData column.
+
+
+
Prepare and plot the data
+
+
+Description of variable definitions and steps for preparing the data
+
+
We make a function, prep_df_whichData(), since we’ll be
+using this workflow a few times for different subsets of the data,
+because we want to be able to differentially display the data based on
+the experience status (experienced with clinical research, human
+genomics research, etc.) of the person saying they’d like access to the
+data. A sketch of the function follows this description.
+
We want to color the bars based on whether or not the controlled
+access dataset is currently available on the AnVIL. We create a
+dataframe onAnVILDF to report this. We used the AnVIL dataset
+catalog/browser to find out this information. However, HPRC and
+GREGoR don’t show up in that resource, but are both available per these
+sources: Announcement
+for HPRC, Access for HPRC,
+Access for
+GREGoR. Both GMKF and TCGA are data hosted on other NCPI platforms
+that are accessible via AnVIL because of interoperability. (See: https://www.ncpi-acc.org/ and https://ncpi-data.org/platforms). We list these as
+non-AnVIL hosted since, while accessible, they are not AnVIL hosted and
+would be inaccessible without NCPI. Finally, UDN is described as non-AnVIL hosted
+as it is in the data submission pipeline and not yet available.
+
We’ll join this anvil-hosted or not data with the actual data at the
+end.
+
Given the input subset_df, we expect several answers to
+be comma separated. Since there are 12 set possible responses (not
+including “None”) and one possible free response answer, we separate the
+AccessWhichControlledData column into 13 columns (“WhichA”
+through “WhichN”), separating on a comma (specifically “, ”, a comma
+followed by a space; otherwise there were duplicates where the only
+difference was a leading space). Alternative approaches could consider
+using str_trim. We set fill to “right”, but this
+shouldn’t really matter. It’s just to suppress the unnecessary warning
+that NAs are being added when there aren’t 13 responses. If there’s only
+one response, it’ll put that response in WhichA and fill
+the rest of them with NA. If there are two responses, it’ll
+put those two responses in WhichA and WhichB
+and fill the rest of them with NA, etc.
+
We then use pivot_longer to grab these columns we just
+made and put the column names in a new column WhichChoice
+and the values in each column into a new column,
+whichControlledAccess. We drop all the NAs in this new
+whichControlledAccess column (and there are a lot of
+them).
+
Then we group by the new whichControlledAccess column
+and summarize a count for how many there are for each response.
+
Then we pass this to a mutate and recode function to simplify the
+fixed responses to be just their acronyms, to remove asterisks (that let
+the survey respondent know that that dataset wasn’t available because of
+policy restrictions), and to recode the free text responses (details
+below in “Notes on free text response recoding”).
+
We use a left_join() to join the cleaned data with a
+dataframe that specifies whether that dataset is currently available on
+the AnVIL or not. It’s a left join rather than a full join so it’s only
+adding the annotation for datasets that are available in the
+results.
+
Finally, we return this subset and cleaned dataframe so that it can
+be plotted.
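+
+A minimal sketch of the helper’s skeleton (onAnVILDF is assumed to be keyed by
+a whichControlledAccess column; the acronym/free-text recodes described above
+are omitted for brevity; the tidyverse is assumed loaded):
+
+prep_df_whichData <- function(subset_df) {
+  subset_df %>%
+    # split the comma-separated selections into one column per possible choice
+    # (13 columns, as described above)
+    separate(AccessWhichControlledData,
+             into = paste0("Which", LETTERS[1:13]),
+             sep = ", ", fill = "right") %>%
+    # gather those columns into one long column of individual selections
+    pivot_longer(cols = starts_with("Which"),
+                 names_to = "WhichChoice",
+                 values_to = "whichControlledAccess") %>%
+    drop_na(whichControlledAccess) %>%
+    # count how many respondents selected each dataset
+    group_by(whichControlledAccess) %>%
+    summarize(n = n(), .groups = "drop") %>%
+    # (recode acronyms and free-text responses here, as described above) then
+    # annotate with whether the dataset is hosted on the AnVIL
+    left_join(onAnVILDF, by = "whichControlledAccess")
+}
+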
+
+
+
+Additional notes on free text response recoding
+
+
+There were 4 “Other” free text responses
+
+
“Being able to pull other dbGap data as needed.” –> We recoded
+this to be an “Other”
+
“GnomAD and ClinVar” –> GnomAD and ClinVar are not controlled
+access datasets so we recoded that response to be “None”
+
“Cancer omics datasets” –> We recoded this to be an “Other”
+
“TCGA” –> This response was left as is since there is a
+controlled access tier.
+
+
+
+
+Description of variable definitions and steps for preparing the data
+continued
+
+
Here we set up 4 data frames for plotting
+
+
The first uses all of the responses and sends them through the
+prep_df_whichData() function to clean the data for plotting
+to see which controlled access datasets are the most popular.
+
The second filters to grab just the responses from those experienced
+in clinical research using the clinicalFlag column
+(described earlier in the Clean Data -> Simplified experience status
+for various research categories (clinical, human genomics, non-human
+genomics) subsection)
+
The third filters to grab just the responses from those experienced
+in human genomic research using the humanGenomicFlag column
+(described earlier in the Clean Data -> Simplified experience status
+for various research categories (clinical, human genomics, non-human
+genomics) subsection)
+
The fourth filters to grab just the responses from those experienced
+in non-human genomic research using the nonHumanGenomicFlag
+column (described earlier in the Clean Data -> Simplified experience
+status for various research categories (clinical, human genomics,
+non-human genomics) subsection)
+
+
+
+
+Description of variable definitions and steps for plotting the bar
+graphs
+
+
We also have a function here because the plotting steps are the same for
+each plot; only the subtitle and the dataframe used as input
+change.
+
This takes the input dataframe and plots a bar plot with the x-axis
+having the controlled access datasets listed (reordering the listing
+based off of the count so most popular is on the left), the count
+number/popularity of requested is on the y-axis, and the fill is based
+on whether the dataset is available on AnVIL or not.
+
We change the theme elements like removing panel borders, panel
+background, and panel grid, and rotate the x-axis tick labels. We add an
+x- and y- axis label and add a title (and subtitle if specified - which
+it will be when we’re looking at just a subset like those who are
+experienced with clinical data)
+
We also add text labels above the bars to say how many times each
+dataset was marked/requested. Note that we have to use the after_stat,
+summary, and sum approach again because of the recoding: for the labels
+to be accurate, they have to capture every response we
+recoded to the same value after we used group_by and summarize to
+count. The function uses coord_cartesian(clip = “off”) so these
+bar text labels aren’t cut off and finally returns the plot.
+
We call this function 4 times
+
+
once for all the data (and don’t use a subtitle)
+
next for just those experienced with clinical data (using a subtitle
+to specify this)
+
next for just those experienced with human genomic data (using a
+subtitle to specify this)
+
and finally for just those experienced with non-human genomic data
+(using a subtitle to specify this)
+
+
+
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
+## 3.5.0.
+## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
+## This warning is displayed once every 8 hours.
+## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
+## generated.
+
+
+Description of variable definitions and steps for preparing the data
+
+
+
+
+Description of variable definitions and steps for plotting the dumbbell
+like plot
+
+
Preferences: Feature importance for current vs potential users
+
Takeaway:
+
+
Prepare and plot the data
+
Average rank is the total rank (sum of given ranks) divided by the number of
+votes (number of given ranks).
+
+
+Description of variable definitions and steps for preparing the data
+
+
We make two different dataframes that find the total ranks (column
+name: totalRank) and average ranks (column name:
+avgRank) for each feature and then row bind
+(bind_rows) these two dataframes together to make
+totalRanksdf. The reason that we make the two separately is
+that one is for Potential users
+(starts_with("PotentialRank")) and one is for Current users
+(starts_with("CurrentRank")). They have a different number
+of votes, nranks, and so it made more sense to work with them
+separately, following the same steps and then row binding them
+together. A sketch of the Potential-user half appears after the list of steps below.
+
The individual steps for each of these dataframes are to
+
+
select the relevant columns from
+resultsTidy
+
perform sums with colSums, adding together the ranks in
+those columns (each column corresponds to a queried feature); We set
+na.rm = TRUE to ignore the NAs (since not every survey
+respondent was asked each question; e.g., if they were a current user
+they weren’t asked as a potential user)
+
send those sums to a data frame such that the selected column names
+from the first step are now the row names and the total summed rank is
+the only column with values in each row corresponding to each queried
+feature
+
Use a mutate to
+
+
add a new column nranks that finds the number of
+responses in the survey are from potential users (e.g., the number that
+would have assigned ranks to the PotentialRank questions) or the number
+of responses in the survey that are from current/returning users (e.g.,
+the number that would have assigned ranks to the CurrentRank
+questions).
+
add a new column avgRank that divides the
+totalRank by the nranks
+
+
+
After these two dataframes are bound together
+(bind_rows), the rest of the steps are for aesthetics in
+plotting and making sure ggplot knows the usertype and the feature of
+interest, etc.
+
+
We move the rownames to their own column
+UsertypeFeature (with the
+mutate(UsertypeFeature = rownames(.))).
+
We separate the values in that column on the word “Rank” to remove
+the UsertypeFeature column we just made and instead make two
+new columns (Usertype and Feature), where
+Usertype is either “Current” or “Potential” and the Features are
+listed in the code below.
+
We then use a case_when within a mutate()
+to fill out those features so they’re more informative and show the
+choices survey respondents were given.
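+
+A minimal sketch of the Potential-user half (the Current-user half follows
+the same steps on the CurrentRank* columns before bind_rows(); this assumes
+the rank columns were already cleaned to integers above and that the
+tidyverse is loaded):
+
+potentialRanksdf <- resultsTidy %>%
+  select(starts_with("PotentialRank")) %>%
+  colSums(na.rm = TRUE) %>%            # total rank per feature, ignoring NAs
+  as.data.frame() %>%
+  setNames("totalRank") %>%
+  mutate(
+    nranks = sum(resultsTidy$UserType == "PotentialUser"),  # number of voters
+    avgRank = totalRank / nranks,
+    UsertypeFeature = rownames(.)      # move the feature rownames into a column
+  ) %>%
+  separate(UsertypeFeature, into = c("Usertype", "Feature"), sep = "Rank")
+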
+
+
+
+
+Description of variable definitions and steps for plotting the dumbbell
+plot
+
+
We use the totalRanksdf we just made. The x-axis is the
+avgRank values, and the y-axis displays the informative
+Feature values, however, we reorder the y-axis
+so that more important (lower number) avgRank features are displayed
+higher in the plot.
+
geom_point and geom_line are used in conjunction to produce the
+dumbbell look of the plot, and we set the color of the points to
+correspond to the Usertype.
+
Some theme elements are changed, labels and titles are added, the
+colors are set to match AnVIL colors, and then we display and save that plot.
+
The first version of the plot has trimmed limits, so the second
+version sets limits on the x-axis of 1 to 6 since those were the options
+survey respondents were given for ranking. It also adds annotations
+(using Grobs, explained
+in this Stackoverflow post answer) to specify which rank was “Most
+important” and which was “Least important”.
+
Then we’ve also adjusted the left margin so that the annotation isn’t
+cut off.
+
We then display and save that version as well.
+
Finally, we’ll reverse the x-axis so that most important is on the
+right and least important is on the left. We use
+scale_x_reverse() for that. We have to change our group
+annotations so that they are now on the negative number version of
+xmin and xmax that we were using previously.
+We then display and save that version as well.
+
+
+
+
+
+
Preferences: Training Workshop Modality
+
Takeaway:
+
+
Prepare and plot the data
+
+
+Description of variable definitions and steps for preparing the data
+
+
+
+
+Description of variable definitions and steps for plotting the dumbbell
+plot
+
+
+
+
+
+
+
Returning User Specific: Likely to recommend?
+
Takeaway:
+
+
+
Returning User Specific: Number of years of use
+
Takeaway:
+
+
+
Returning User Specific: Foreseeable Computational Needs
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/config_automation.yml b/config_automation.yml
index 12d9dd0..fbfac6a 100644
--- a/config_automation.yml
+++ b/config_automation.yml
@@ -21,4 +21,4 @@ render-coursera: no
# What docker image should be used for rendering?
# The default is jhudsl/base_ottr:main
-rendering-docker-image: 'jhudsl/base_ottr:main'
+rendering-docker-image: 'jhudsl/anvil-poll-2024:main'
diff --git a/contact.Rmd b/contact.Rmd
deleted file mode 100644
index fc4eedc..0000000
--- a/contact.Rmd
+++ /dev/null
@@ -1,11 +0,0 @@
----
-title: "**Contact Us**"
-output:
- html_document
----
-
-
-
-If you have questions please contact:
-
-* Carrie Wright (cwrigh60@jhu.edu)
diff --git a/data/codebook.txt b/data/codebook.txt
new file mode 100644
index 0000000..6aa2093
--- /dev/null
+++ b/data/codebook.txt
@@ -0,0 +1,62 @@
+SurveyColNames SimplifiedColNames VariableFormat Description
+Timestamp Timestamp double Date and time of survey response submission
+How would you describe your current usage of the AnVIL platform? CurrentUsageDescription character "Possible answers include For completed/long-term projects (e.g., occasional updates/maintenance as needed), For ongoing projects (e.g., consistent project development and/or work), For short-term projects (e.g., short, intense bursts separated by a few months). These responses are considered current or returning AnVIL users. Other possible responses represent potential AnVIL users. These include I do no currently use the AnVIL, but have in the past, I have never heard of the AnVIL, I have never used the AnVIL, but have heard of it."
+Rank the following features or resources according to their importance to you as a potential user of the AnVIL? [Easy billing setup] PotentialRankEasyBillingSetup list "This question was only given to potential AnVIL users. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked"
+Rank the following features or resources according to their importance to you as a potential user of the AnVIL? [Flat-rate billing rather than use-based] PotentialRankFlatRateBilling list "This question was only given to potential AnVIL users. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked"
+Rank the following features or resources according to their importance to you as a potential user of the AnVIL? [Free version with limited compute or storage] PotentialRankFreeVersion list "This question was only given to potential AnVIL users. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked"
+Rank the following features or resources according to their importance to you as a potential user of the AnVIL? [On demand support and documentation] PotentialRankSupportDocs list "This question was only given to potential AnVIL users. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked"
+Rank the following features or resources according to their importance to you as a potential user of the AnVIL? [Specific tools or datasets are available/supported] PotentialRankToolsData list "This question was only given to potential AnVIL users. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked"
+Rank the following features or resources according to their importance to you as a potential user of the AnVIL? [Greater adoption of the AnVIL by the scientific community] PotentialRankCommunityAdoption list "This question was only given to potential AnVIL users. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked"
+How long have you been using the AnVIL? LengthOfUse character "This question was only given to current or returning AnVIL users. Possible responses include < 1 yr, 1-2 yrs, 2-3 yrs, 4+ yrs, or NA. NA applies to survey responses where this questions wasn't asked"
+Rank the following features or resources according to their importance for your continued use of the AnVIL [Easy billing and workgroup management] CurrentRankEasyBillingSetup list "This question was only given to current or returning AnVIL users. It relates to the potential ranking one from PotentialRankEasyBillingSetup. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked"
+Rank the following features or resources according to their importance for your continued use of the AnVIL [Flat-rate billing rather than use-based] CurrentRankFlatRateBilling list "This question was only given to current or returning AnVIL users. It relates to the potential ranking one from PotentialRankFlatRateBilling. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked"
+Rank the following features or resources according to their importance for your continued use of the AnVIL [Free version with limited compute or storage] CurrentRankFreeVersion list "This question was only given to current or returning AnVIL users. It relates to the potential ranking one from PotentialRankFreeVersion. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked"
+Rank the following features or resources according to their importance for your continued use of the AnVIL [On demand support and documentation] CurrentRankSupportDocs list "This question was only given to current or returning AnVIL users. It relates to the potential ranking one from PotentialRankSupportDocs. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked"
+Rank the following features or resources according to their importance for your continued use of the AnVIL [Specific tools or datasets are available/supported] CurrentRankToolsData list "This question was only given to current or returning AnVIL users. It relates to the potential ranking one from PotentialRankToolsData. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked"
+Rank the following features or resources according to their importance for your continued use of the AnVIL [Greater adoption of the AnVIL by the scientific community] CurrentRankCommunityAdoption list "This question was only given to current or returning AnVIL users. It relates to the potential ranking one from PotentialRankCommunityAdoption. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked"
+How would you rate your knowledge of or comfort with these technologies (on the AnVIL)? [Jupyter Notebooks] CurrentAnVILTechJupyterNotebooks character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked"
+How would you rate your knowledge of or comfort with these technologies (on the AnVIL)? [Bioconductor & RStudio] CurrentAnVILTechBioconductorRStudio character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked"
+How would you rate your knowledge of or comfort with these technologies (on the AnVIL)? [Galaxy] CurrentAnVILTechGalaxy character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked"
+How would you rate your knowledge of or comfort with these technologies (on the AnVIL)? [WDL Workflows] CurrentAnVILTechWDL character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked"
+How would you rate your knowledge of or comfort with these technologies (on the AnVIL)? [Containers] CurrentAnVILTechContainers character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked"
+How would you rate your knowledge of or comfort with these technologies (on the AnVIL)? [Unix / Command Line] CurrentAnVILTechCommandLine character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked"
+"How would you rate your knowledge of or comfort with these AnVIL data features? [Accessing controlled access datasets (e.g., GTEx, CCDG)]" CurrentAnVILTechAccessData character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked"
+How would you rate your knowledge of or comfort with these AnVIL data features? [DUOS (Data Use Oversight System)] CurrentAnVILTechDUOS character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked"
+How would you rate your knowledge of or comfort with these AnVIL data features? [Terra on AnVIL (Workspaces)] CurrentAnVILTechTerra character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked"
+How would you rate your knowledge of or comfort with these AnVIL data features? [TDR (Terra Data Repository)] CurrentAnVILTechTDR character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked"
+What one feature in the AnVIL do you appreciate the most? FeatureMostAppreciated character "This question was only asked to current or returning AnVIL users. Possible answers include Access to controlled access datasets, WDL Workflows, Bioconductor & RStudio on AnVIL, Jupyter Notebooks on AnVIL, Galaxy on AnVIL, Terra on AnVIL, Dockstore for workflows/containers, Available Support, Other (with free text entry if Other is selected), or NA. NA applies to survey responses where this question wasn't asked."
+What computational and storage resources do you foresee needing in the next 12 months? (Select all that apply) NeededResources character "This select all question was only asked to current or returning AnVIL users. If multiple answers were selected, the answers will be comma separated. Possible answers include Many nodes, GPUs, Lots of storage (Terabytes), Large memory (>192 GB RAM), I don't know, Other (with free text entry if Other is selected), or NA. NA applies to survey responses where this question wasn't asked."
+How likely are you to recommend the AnVIL to a colleague? RecommendationLikelihood double "This question was only asked to current or returning AnVIL users. Possible answers include 1, 2, 3, 4, 5, or NA. 1 corresponds to Not at all likely; and 5 corresponds to Extremely likely. NA applies to survey responses where this question wasn't asked."
+What is the highest degree you have attained? Degrees character "This question was given to every survey taker, and more than one answer could be selected, if so, they would be separated by commas. Possible answers include High school or equivalent, Bachelor's degree, Master's degree in progress, Master's degree, PhD in progress, PhD, MD in progress, MD, Other (with free text entry if Other is selected)."
+Which industry do you work in? (Select all that apply) Industry character "This question was given to every survey taker, and more than one answer could be selected, if so, they would be separated by commas. Possible answers include Academia/University, Research Institute, Clinical/Hospital, Biotech (includes start up), Pharmaceutical, Government, Other (with free text entry if Other is selected)."
+What kind of work do you do? (Check up to 2 that you do most often) KindOfWork character "This question was given to every survey taker, and more than one answer could be selected, if so, they would be separated by commas. Possible answers include Computational work, Engineering work, Wet lab work, Clinical work, Computational education, Wet lab education, Project leadership, Project management, Program administration, Other (with free text entry if Other is selected). We asked for people to select up to 2, but several selected more than 2."
+What institution are you affiliated with? InstitutionalAffiliation character "This question was given to every survey taker, and was free response."
+"Please list any relevant consortia affiliations (e.g., your consortium performs work or stores data on the AnVIL). Please enter NA if not relevant." ConsortiaAffiliations character "This question was given to every survey taker, and was free response. Users were asked to reply with NA if not relevant. There are na and NA responses. If more than one consortium is listed, responders separated responses by commas, semicolons, ampersands, or the word and."
+How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [Python] AllTechPython character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. There isn't a directly related question to the CurrentAnVILTech ones. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [Jupyter Notebooks] AllTechJupyterNotebooks character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. This is related to question CurrentAnVILTechJupyterNotebooks one. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [R] AllTechR character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. There isn't a directly related question to the CurrentAnVILTech ones. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [RStudio] AllTechRStudio character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. This is related to question CurrentAnVILTechRStudio one. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [Bioconductor] AllTechBioconductor character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. This is related to question CurrentAnVILTechRStudio one. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [Galaxy] AllTechGalaxy character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. This is related to question CurrentAnVILTechGalaxy one. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+"How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [Workflows (e.g., WDL)]" AllTechWorkflows character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. This is related to question CurrentAnVILTechWDL one. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [Containers] AllTechContainers character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. This is related to question CurrentAnVILTechContainers one. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [Unix / Command Line] AllTechCommandLine character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. This is related to question CurrentAnVILTechCommandLine one. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+What types of data do you or would you analyze using the AnVIL? (Select all that apply) TypesOfData character "This question was given to every survey taker, and more than one answer could be selected, if so, they would be separated by commas. Possible answers include Genomes/exomes, Transcriptomes, Metagenomes, Proteomes, Metabolomes, Epigenomes, Structural, Single Cell, Imaging, Phenotypic, Electronic Health Record, Metadata, Survey, and Other (with free text entry if Other is selected). Other responses seem to be either very specific or something like not analyzing data on AnVIL."
+How much experience do you have analyzing the following data categories? [Human genomic] HumanGenomicExperience character "This question was given to every survey taker. Possible answers include Not at all experienced, Slightly experienced, Somewhat experienced, Moderately experienced, or Extremely experienced."
+How much experience do you have analyzing the following data categories? [Non-human genomic] NonHumanGenomicExperience character "This question was given to every survey taker. Possible answers include Not at all experienced, Slightly experienced, Somewhat experienced, Moderately experienced, or Extremely experienced."
+How much experience do you have analyzing the following data categories? [Human clinical] HumanClinicalExperience character "This question was given to every survey taker. Possible answers include Not at all experienced, Slightly experienced, Somewhat experienced, Moderately experienced, or Extremely experienced."
+"How interested are you in working with controlled access datasets? (e.g., GTEx, CCDG)" InterestControlledData double "This question was given to every survey taker. Possible answers include 1, 2, 3, 4, 5. 1 corresponds to Not at all interested. 5 corresponds to Extremely interested."
+"What large, controlled access datasets do you access or would you be interested in accessing using the AnVIL? Those marked with an asterisk (*) are not currently available on the AnVIL due to policy restrictions. (Select all that apply)" AccessWhichControlledData character "This question was asked to all survey takers and more than one answer could be selected. If multiple answers are selected, the answers are comma separated. Possible answers include All of Us*, Centers for Common Disease Genomics (CCDG), The Centers for Mendelian Genomics (CMG), Clinical Sequencing Evidence-Generating Research (CSER), Electronic Medical Records and Genomics (eMERGE), Gabriella Miller Kids First (GMFK), Genomics Research to Elucidate the Genetics of Rare Diseases (GREGoR), The Genotype-Tissue Expression Project (GTEx), The Human Pangenome Reference Consortium (HPRC), Population Architecture Using Genomics and Epidemiology (PAGE), Undiagnosed Disease Network (UDN), UK Biobank*, None, or Other (with free text entry if Other is selected)."
+Have you attended a monthly AnVIL Demo? (Example) AnVILDemo character "This question was given to every survey taker. Possible answers include Yes, multiple; Yes, one; No, but aware of; No, didn't know of; Not yet, but am registered to."
+Have you ever read or posted in our AnVIL Support Forum? (Select all that apply) AnVILSupportForum character "This question was given to every survey taker and more than one answer could be selected. If so, the answers would be separated by commas. Possible answers include Read through others' posts, Posted in, Answered someone's post, No, but aware of, No, didn't know of. Note that the options No, but aware of and No, didn't know of each contain a comma, so those embedded commas need to be removed or replaced before splitting responses on commas during the rest of data cleaning."
+Please rank how/where you would prefer to attend AnVIL training workshops. [On-site at my institution] AnVILTrainingWorkshopsOnSite list "This question was given to every survey taker and possible answers include 1 (Most preferred in this list), 2, 3, 4, 5 (Least preferred in this list)."
+Please rank how/where you would prefer to attend AnVIL training workshops. [Virtual] AnVILTrainingWorkshopsVirtual list "This question was given to every survey taker and possible answers include 1 (Most preferred in this list), 2, 3, 4, 5 (Least preferred in this list)."
+"Please rank how/where you would prefer to attend AnVIL training workshops. [Conference (e.g., CSHL, AMIA)]" AnVILTrainingWorkshopsConference list "This question was given to every survey taker and possible answers include 1 (Most preferred in this list), 2, 3, 4, 5 (Least preferred in this list)."
+Please rank how/where you would prefer to attend AnVIL training workshops. [AnVIL-specific event] AnVILTrainingWorkshopsSpecEvent list "This question was given to every survey taker and possible answers include 1 (Most preferred in this list), 2, 3, 4, 5 (Least preferred in this list)."
+Please rank how/where you would prefer to attend AnVIL training workshops. [Other] AnVILTrainingWorkshopsOther list "This question was given to every survey taker and possible answers include 1 (Most preferred in this list), 2, 3, 4, 5 (Least preferred in this list)."
+"If you ranked \""Other\"" highly for venues you would prefer to attend for training workshops, please specify here." OtherExamplesAnVILTrainingWorkshops character "This question was given to every survey taker and was free response. Some responses are NA or na, either entered by the responder or due to the responder not providing an answer."
+Where do you currently run analyses? (Select all that apply) WhereAnalysesRun character "This question was given to every survey taker and multiple answer choices could be selected. If multiple answers were selected, the answers will be comma separated. Possible answers include Personal Computer (locally), Institutional High Performance Computing cluster (HPC), Amazon Web Services (AWS), Google Cloud Platform (GCP), Microsoft Azure, Galaxy (usegalaxy.org), or Other (with free text entry if Other is selected)"
+"What repositories are you considering to share data (for example, to comply with the NIH DMS Policy)? (Select all that apply)" RepositoriesDMS character "This question was given to every survey taker and multiple answers could be selected. If multiple answers were selected, the answers will be comma separated. Possible answers include Your institutional repository, Zenodo, AnVIL, I'm not sure what DMS policy is, or Other (with free text entry if Other is selected)."
+What source(s) of funds do you use to pay for cloud computing? (Select all that apply) FundingSources character "This question was given to all survey takers and multiple answers could be selected. If multiple answers were selected, the answers will be comma separated. Possible answers include NHGRI, Other NIH, Foundation Grant, Institutional funds, Don't know, Only use free options, Other (with free text entry if Other is selected)"
+Are you willing to be contacted again to give input in the future? ContactWillingness character This question was given to every survey taker. Possible answers include Yes or No.
+What email address may we use to contact you in the future? Email character This question was given only to survey takers who responded yes when asked if they were willing to be contacted in the future. It was free text response with the responder providing their email. NA corresponds to survey responses where the question wasn't asked.
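+
+A minimal sketch of how the comma-separated multi-select columns described above might be parsed (the data frame name `results` is an assumption, and options such as "No, but aware of" contain embedded commas that must be handled first):
+
+```r
+library(tidyverse)
+
+# Split a multi-select column into one row per selection, then count selections
+results %>%
+  separate_rows(TypesOfData, sep = ", ") %>%
+  count(TypesOfData, sort = TRUE)
+
+# For AnVILSupportForum, rewrite "No, " as "No " first so the commas embedded in
+# "No, but aware of" / "No, didn't know of" are not treated as separators
+results %>%
+  mutate(AnVILSupportForum = str_replace_all(AnVILSupportForum, "No, ", "No ")) %>%
+  separate_rows(AnVILSupportForum, sep = ", ") %>%
+  count(AnVILSupportForum, sort = TRUE)
+```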
diff --git a/data/controlledAccessData_codebook.txt b/data/controlledAccessData_codebook.txt
new file mode 100644
index 0000000..f29d5de
--- /dev/null
+++ b/data/controlledAccessData_codebook.txt
@@ -0,0 +1,16 @@
+Dataset whichControlledAccess AnVIL_Availability
+All of Us* All of Us non-AnVIL hosted
+UK Biobank* UK Biobank non-AnVIL hosted
+Centers for Common Disease Genomics (CCDG) CCDG AnVIL hosted
+The Centers for Mendelian Genomics (CMG) CMG AnVIL hosted
+Clinical Sequencing Evidence-Generating Research (CSER) CSER AnVIL hosted
+Electronic Medical Records and Genomics (eMERGE) eMERGE AnVIL hosted
+Gabriella Miller Kids First (GMKF) GMKF non-AnVIL hosted
+Genomics Research to Elucidate the Genetics of Rare Diseases (GREGoR) GREGoR AnVIL hosted
+The Genotype-Tissue Expression Project (GTEx) GTEx AnVIL hosted
+The Human Pangenome Reference Consortium (HPRC) HPRC AnVIL hosted
+Population Architecture Using Genomics and Epidemiology (PAGE) PAGE AnVIL hosted
+Undiagnosed Disease Network (UDN) UDN non-AnVIL hosted
+Being able to pull other dbGap data as needed., Cancer omics datasets Other NA
+GnomAD and ClinVar None NA
+TCGA TCGA non-AnVIL hosted
diff --git a/data/institution_codebook.txt b/data/institution_codebook.txt
new file mode 100644
index 0000000..8336190
--- /dev/null
+++ b/data/institution_codebook.txt
@@ -0,0 +1,30 @@
+InstitutionalAffiliation CorrespondingSurveyEntries InstitutionalType
+Arizona State University Arizona State University R1 University
+Baylor College of Medicine Baylor College of Medicine Medical Center or School
+Boston Children's Hospital Boston Children's Hospital Medical Center or School
+Broad Institute "Broad, broad institute, Broad Institute" Research Center
+Carnegie Institution of Washington Carnegie Institution of Washington Research Center
+City University of New York "City University of New York, CUNY School of Public Health; Roswell Park Comprehensive Cancer Center" R2 University
+Clovis Community College Clovis Community College Community College
+Columbia University Irving Medical Center Columbia University Irving Medical Center R1 University
+Harvard University "harvard, Harvard Public Health" R1 University
+Johns Hopkins "Johns hopkins, Johns Hopkins, Johns Hopkins University" R1 University
+Lieber Institute for Brain Development Lieber Institute for Brain Development Research Center
+Moffitt Cancer Center Moffitt Cancer Center Medical Center or School
+NHGRI NHGRI NIH
+Oregon Health & Science University "OHSU, OHSU (Knight Center)" R1 University
+Pacific Biosciences Pacific Biosciences Industry
+Penn State University Penn State University R1 University
+Stanford University Stanford University R1 University
+Ohio State University The Ohio State University R1 University
+University of California Santa Cruz "UCSC, univ. ca. santa cruz, university of California santa cruz " R1 University
+UMass Chan Medical School "Umass Chan Medical School, UMASS Chan Medical School" Medical Center or School
+University of Minnesota University of Minnesota R1 University
+University of Queensland University of Queensland International Location
+University of Texas at El Paso University of Texas at El Paso R1 University
+University of Virginia University of Virginia R1 University
+University of Washington University of Washington R1 University
+Vanderbilt University Medical Center Vanderbilt University Medical Center R1 University
+Washington University in St. Louis "Washington University in St. Louis, Washington University in St Louis" R1 University
+Yikon Genomics yikongene Industry
+Unknown v Unknown
\ No newline at end of file
diff --git a/docs/1PANHDY3T9wpEX1GBYcqkdYMG0c20UeZPH2pwjHFcLik.pptx b/docs/1PANHDY3T9wpEX1GBYcqkdYMG0c20UeZPH2pwjHFcLik.pptx
new file mode 100644
index 0000000..02e9828
Binary files /dev/null and b/docs/1PANHDY3T9wpEX1GBYcqkdYMG0c20UeZPH2pwjHFcLik.pptx differ
diff --git a/docs/hosting.html b/docs/CurrentUserQs.html
similarity index 66%
rename from docs/hosting.html
rename to docs/CurrentUserQs.html
index 6d6e522..bfdbca9 100644
--- a/docs/hosting.html
+++ b/docs/CurrentUserQs.html
@@ -11,10 +11,10 @@
-Hosting
+Current User Specific Questions
-
-
+
+
@@ -32,8 +32,8 @@
-
-
+
+
@@ -61,7 +61,6 @@
-
@@ -87,9 +86,6 @@
summary {
display: list-item;
}
-details > summary > p:only-child {
- display: inline;
-}
pre code {
padding: 0;
}
@@ -146,15 +142,11 @@
href = "index.html";
var menuAnchor = $('a[href="' + href + '"]');
- // mark the anchor link active (and if it's in a dropdown, also mark that active)
- var dropdown = menuAnchor.closest('li.dropdown');
- if (window.bootstrap) { // Bootstrap 4+
- menuAnchor.addClass('active');
- dropdown.find('> .dropdown-toggle').addClass('active');
- } else { // Bootstrap 3
- menuAnchor.parent().addClass('active');
- dropdown.addClass('active');
- }
+ // mark it active
+ menuAnchor.tab('show');
+
+ // if it's got a parent navbar menu mark it active as well
+ menuAnchor.closest('li.dropdown').addClass('active');
// Navbar adjustments
var navHeight = $(".navbar").first().height() + 15;
@@ -183,8 +175,8 @@
border-radius: 4px;
}
-.tabset-dropdown > .nav-tabs > li.active:before, .tabset-dropdown > .nav-tabs.nav-tabs-open:before {
- content: "\e259";
+.tabset-dropdown > .nav-tabs > li.active:before {
+ content: "";
font-family: 'Glyphicons Halflings';
display: inline-block;
padding: 10px;
@@ -192,11 +184,18 @@
}
.tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
- content: "\e258";
- font-family: 'Glyphicons Halflings';
+ content: "";
border: none;
}
+.tabset-dropdown > .nav-tabs.nav-tabs-open:before {
+ content: "";
+ font-family: 'Glyphicons Halflings';
+ display: inline-block;
+ padding: 10px;
+ border-right: 1px solid #ddd;
+}
+
.tabset-dropdown > .nav-tabs > li.active {
display: block;
}
@@ -238,12 +237,12 @@
To host your website on GitHub, you will need to go to settings and
-click on the pages tab.
-
Again to go to settings click on the far upper right corner:
-
-
Click on Pages:
-
-
Select the main branch and the root directory and press save. Be sure
-to also check the “Enforce HTTPS” box. Afterwards your repository should
-look like this:
-
-
Note that in general, your website will be published to a URL like
-this:
-
https://username.github.io/repository_name/
-
If you have multiple websites published underneath your username or
-organization, this should still publish fine. This website will be a
-subdirectory that is named whatever you have named this repository.
-
If you’ve published this website underneath a GitHub organization
-(not your own personal GitHub profile), then in the above example URL
-the organization name will be where we’ve put username.
-
For more about GitHub pages (including how to personalize your URL)
-see the GitHub pages documentation here: https://pages.github.com/
-
Sometimes, GitHub page publishing will take a bit of time. You can
-click on the Actions tab in your repository and see if
-there is a pages and deployment action currently running
-(indicated by a yellow circle next to the action name). If this is the
-case, you will need to wait until this becomes a green check mark before
-your GitHub page will be published.
+
+
+
Length of Use of the AnVIL
+
+
+
Takeaway
+
+We observe fairly even sampling of current users with regard to the number of years they’ve been using the AnVIL.
+
+
+
+
+
Foreseeable Computational Needs
+
+
+
Takeaway
+
Of the 22 current users, all 22 provided an answer to this question. The most common response here was needing large amounts of storage.
+
+
+
Potential Follow-ups
+
+
What do we see in other responses from the “don’t know” responses?
+
+
+
+
+
+
Recommendation Likelihood
+
+
+
Takeaway
+
There’s a fairly bimodal distribution here with users either extremely likely or only moderately likely to recommend the AnVIL.
+
+
+
Potential Follow-ups
+
+
What do we see in other responses from those who are extremely likely to recommend the AnVIL to a colleague? e.g., Are they part of a consortium?
+
What do we see in other responses from those who are only moderately or not at all likely to recommend the AnVIL?
+
+
diff --git a/docs/CurrentUserQs_files/figure-html/unnamed-chunk-2-1.png b/docs/CurrentUserQs_files/figure-html/unnamed-chunk-2-1.png
new file mode 100644
index 0000000..56f2b02
Binary files /dev/null and b/docs/CurrentUserQs_files/figure-html/unnamed-chunk-2-1.png differ
diff --git a/docs/CurrentUserQs_files/figure-html/unnamed-chunk-3-1.png b/docs/CurrentUserQs_files/figure-html/unnamed-chunk-3-1.png
new file mode 100644
index 0000000..ab6f076
Binary files /dev/null and b/docs/CurrentUserQs_files/figure-html/unnamed-chunk-3-1.png differ
diff --git a/docs/CurrentUserQs_files/figure-html/unnamed-chunk-4-1.png b/docs/CurrentUserQs_files/figure-html/unnamed-chunk-4-1.png
new file mode 100644
index 0000000..8d09035
Binary files /dev/null and b/docs/CurrentUserQs_files/figure-html/unnamed-chunk-4-1.png differ
diff --git a/docs/editing.html b/docs/Demographics.html
similarity index 65%
rename from docs/editing.html
rename to docs/Demographics.html
index 97e4f4d..bac3e2d 100644
--- a/docs/editing.html
+++ b/docs/Demographics.html
@@ -11,10 +11,10 @@
-Editing
+Demographics
-
-
+
+
@@ -32,8 +32,8 @@
-
-
+
+
@@ -61,7 +61,6 @@
-
@@ -87,9 +86,6 @@
summary {
display: list-item;
}
-details > summary > p:only-child {
- display: inline;
-}
pre code {
padding: 0;
}
@@ -146,15 +142,11 @@
href = "index.html";
var menuAnchor = $('a[href="' + href + '"]');
- // mark the anchor link active (and if it's in a dropdown, also mark that active)
- var dropdown = menuAnchor.closest('li.dropdown');
- if (window.bootstrap) { // Bootstrap 4+
- menuAnchor.addClass('active');
- dropdown.find('> .dropdown-toggle').addClass('active');
- } else { // Bootstrap 3
- menuAnchor.parent().addClass('active');
- dropdown.addClass('active');
- }
+ // mark it active
+ menuAnchor.tab('show');
+
+ // if it's got a parent navbar menu mark it active as well
+ menuAnchor.closest('li.dropdown').addClass('active');
// Navbar adjustments
var navHeight = $(".navbar").first().height() + 15;
@@ -183,8 +175,8 @@
border-radius: 4px;
}
-.tabset-dropdown > .nav-tabs > li.active:before, .tabset-dropdown > .nav-tabs.nav-tabs-open:before {
- content: "\e259";
+.tabset-dropdown > .nav-tabs > li.active:before {
+ content: "";
font-family: 'Glyphicons Halflings';
display: inline-block;
padding: 10px;
@@ -192,11 +184,18 @@
}
.tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
- content: "\e258";
- font-family: 'Glyphicons Halflings';
+ content: "";
border: none;
}
+.tabset-dropdown > .nav-tabs.nav-tabs-open:before {
+ content: "";
+ font-family: 'Glyphicons Halflings';
+ display: inline-block;
+ padding: 10px;
+ border-right: 1px solid #ddd;
+}
+
.tabset-dropdown > .nav-tabs > li.active {
display: block;
}
@@ -238,12 +237,12 @@
Now that your website is alive and hosted, how do you start
-customizing it to have your content?
-
If you already know how to file pull requests, feel free to start
-making edits as you see fit using some of the structure points we’ve
-noted below.
To change the title of the website, modify the name line
-of the _site.yml file.
-
+
+
+
Highest Degree
+
+
+
Takeaway
+
Most of the respondents have a PhD or are currently working on a PhD, though a range of career stages are represented.
+
+
+
+
+
Kind of Work
+
+
+
Takeaway
+
Only a few responses showed project management, leadership, or administration as their only kind of work. This increases our confidence that this won’t confound later questions asking about usage of datasets or tools.
+
+
+
Potential Follow-ups
+
+
Use this together with other info to try to cluster respondents/users into personas.
+
+
+
+
+
+
Institutional Affiliation
+
+
+
Takeaway
+
+Most of the individuals using the AnVIL report being affiliated with a research-intensive institution. A further breakdown of these counts is below.
+
+
+
Potential Follow-ups
+
+
Why is industry not represented in current users?
+
+
+
Related Plots
+
+
+
+
-
-
Modifying pages
-
Pages are specified in the navigation bar by the lines that say
--text: and href: .
-
The -text: specifies what the navigation bar will say
-for that tab.
-
The href: specifies which rendered Rmd file to use for
-that tab, it needs to be the html version of this file.
-
The tabs are specified to be aligned to the left (as specified by the
-left on line 5).
-
The tabs will appear in the order listed.
-
You can also add icons to these tabs using font awesome as is shown for the
-contact page on line 18.
-
-
Other icon options include Bootstap
-glyphicons or ion icons.
-Note that not all icons will work because they are not
-all set up with the packages that make rendering the website possible,
-so this may require some trial and error. Here is an example of how you
-would use all of these icon options to add more:
-
-
This would result in a navigation bar with these icons:
-
+
+
Consortia Affiliations
+
Of 50 responses, only 21 provided an affiliation. The most represented were
To change the part of the navigation bar that says “OTTR Web”, modify
-the title within the _site.yml file.
-
-
-
-
Overall theme
-
To change the color scheme/fonts of the website modify the
-theme in the _site.yml file (see here for
-options):
-
-
-
-
Change the favicon
-
The small image that shows up on the browser can also be changed.
-
You can make a small image to replace the existing one by going to https://favicon.io/favicon-converter/ and uploading an
-image that you would like.
-
Next, simply replace the image called favicon.ico in the
-images directory within the resources
-directory with the image you just created and downloaded from the
-favicon converter website.
+
+
+
Takeaway
+
+Of the 50 responses, 22 were current users and 28 were potential users. The majority of current users belonged to the group who use the AnVIL for ongoing projects, while potential users were roughly evenly split between those who have never used the AnVIL (but have heard of it) and those who previously used the AnVIL but don’t currently.
-
-
Additional changes
-
To make additional changes to the style, you can modify the
-styles.css file with css code. This website has great information
-about css code.
-
As an example if you wanted to change the color of the blue line to
-green you could change where it says lightblue to
-lightgreen in the styles.css file. You can
-also use a hex color code like those that can be found at this website, such as
-#00FF9E to get a specific shade.
-
-
Note that if you change the css file with a new element that is not
-already defined like body then you would need to do it as
-done with the banner element. This was then added to the
-index.Rmd file by using:
Look to see whether those potential users who previously used the AnVIL show similar overall trends to the rest of the potential users
+
Directly ask why they no longer use the AnVIL
+
+
+
+Description of variable definitions and steps in preparing and plotting the data
+
+
+First, we group the data by the assigned UserType labels/categories and their related, more detailed descriptions. Then we use summarize to count the occurrences of each of those categories. We use a mutate statement so the detailed descriptions fit better on the plot. We then send this data to ggplot with the count on the x-axis and the usage descriptions on the y-axis (ordered by count so the highest count is on top). We fill by the user type we’ve assigned. We manually scale the fill to AnVIL colors and specify a stacked bar chart. We then adjust the theme and labels and finally add a geom_text label showing the count next to each bar before saving the plot (a minimal sketch of these steps follows below).
+
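+A minimal sketch of the steps described above (the column names `UserType` and `CurrentUsageDescription`, the fill colors, and the output path are assumptions based on other chunks in this repository; the actual chunk lives in the page's Rmd source, which is not shown in this hunk):
+
+```r
+library(tidyverse)
+
+usagePlot <- resultsTidy %>%
+  # count respondents per user type and usage description
+  group_by(UserType, CurrentUsageDescription) %>%
+  summarize(count = n()) %>%
+  # wrap long descriptions so they fit better on the plot
+  mutate(CurrentUsageDescription = str_wrap(CurrentUsageDescription, width = 35)) %>%
+  ggplot(aes(x = count,
+             y = reorder(CurrentUsageDescription, count),  # highest count on top
+             fill = UserType)) +
+  geom_bar(position = "stack", stat = "identity") +        # stacked bar chart
+  scale_fill_manual(values = c("#E0DD10", "#035C94")) +    # AnVIL colors (assumed hex values)
+  # total count at the end of each bar, mirroring the geom_text pattern used elsewhere in this repo
+  geom_text(aes(label = after_stat(x), group = CurrentUsageDescription),
+            stat = "summary", fun = sum, hjust = -1, size = 2) +
+  theme_bw() +
+  labs(x = "Count", y = NULL)
+
+ggsave(here::here("resources/images/usage_plot.png"), usagePlot)  # hypothetical output path
+```
+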
diff --git a/docs/IdentifyTypeOfUsers_files/figure-html/unnamed-chunk-2-1.png b/docs/IdentifyTypeOfUsers_files/figure-html/unnamed-chunk-2-1.png
new file mode 100644
index 0000000..87685cc
Binary files /dev/null and b/docs/IdentifyTypeOfUsers_files/figure-html/unnamed-chunk-2-1.png differ
diff --git a/docs/contact.html b/docs/contact.html
index 912fba5..0f4013c 100644
--- a/docs/contact.html
+++ b/docs/contact.html
@@ -13,8 +13,8 @@
Contact Us
-
-
+
+
@@ -32,8 +32,8 @@
-
-
+
+
@@ -61,7 +61,6 @@
-
@@ -87,9 +86,6 @@
summary {
display: list-item;
}
-details > summary > p:only-child {
- display: inline;
-}
pre code {
padding: 0;
}
@@ -146,15 +142,11 @@
href = "index.html";
var menuAnchor = $('a[href="' + href + '"]');
- // mark the anchor link active (and if it's in a dropdown, also mark that active)
- var dropdown = menuAnchor.closest('li.dropdown');
- if (window.bootstrap) { // Bootstrap 4+
- menuAnchor.addClass('active');
- dropdown.find('> .dropdown-toggle').addClass('active');
- } else { // Bootstrap 3
- menuAnchor.parent().addClass('active');
- dropdown.addClass('active');
- }
+ // mark it active
+ menuAnchor.tab('show');
+
+ // if it's got a parent navbar menu mark it active as well
+ menuAnchor.closest('li.dropdown').addClass('active');
// Navbar adjustments
var navHeight = $(".navbar").first().height() + 15;
@@ -183,8 +175,8 @@
border-radius: 4px;
}
-.tabset-dropdown > .nav-tabs > li.active:before, .tabset-dropdown > .nav-tabs.nav-tabs-open:before {
- content: "\e259";
+.tabset-dropdown > .nav-tabs > li.active:before {
+ content: "";
font-family: 'Glyphicons Halflings';
display: inline-block;
padding: 10px;
@@ -192,11 +184,18 @@
}
.tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
- content: "\e258";
- font-family: 'Glyphicons Halflings';
+ content: "";
border: none;
}
+.tabset-dropdown > .nav-tabs.nav-tabs-open:before {
+ content: "";
+ font-family: 'Glyphicons Halflings';
+ display: inline-block;
+ padding: 10px;
+ border-right: 1px solid #ddd;
+}
+
.tabset-dropdown > .nav-tabs > li.active {
display: block;
}
@@ -238,12 +237,12 @@
We have set up several checks for website content edits.
-
When you create a pull request, you will see something like this if
-everything is successful. You can click on the
-preview of website here link to see a preview. Please note
-that some features may not be possible to see in the preview. For
-example, icons may only show up as a box.
-
-
-
Rendering Action
-
If the rendering action fails, you will see something like this:
-
-
If you click on where it says Details on the far right,
-you will be taken to more information about what may have gone
-wrong.
-
-
For example, we can see that an R object was not found in one of the
-files. You could identify which file by scrolling up.
-
-
-
-
Spelling and Style Action
-
You may find that you have spelling errors if you get the following
-message from your pull request (PR):
-
-
If this happens, click the Download the errors here.
-link. This will take you to a table with words that the check thought
-were misspelled, as well as what file they occurred in and the lines in
-that file.
-
Add words that are not actually misspelled to the
-dictionary.txt file located in the resources
-directory. It’s a good idea to try to keep this in alphabetical
-order.
-
For words that are indeed misspelled, fix the errors and push your
-changes to your pull request.
-
You should then see that your pull request has a different message
-that tells you that you have no spelling errors.
-
-
-
URL Check
-
You may find that you have broken URLs. If so, you will get this
-message when you create a Pull Request.
-
-You can click on the Download the errors here link to see a
-document with a list of broken URLs and what files they are located in,
-like this:
-
-
Here we can see that there are two URLs that are broken in the
-git_actions.Rmd file.
-
-
-
Completing a Pull Request
-
Once all the Git Action checks pass, you can merge your pull request
-to your main branch for your website.
-
-
If you are only working on this yourself without others to review
-your pull request, you can click the
-Merge without waiting for requirements to be met box, so
-that you can click the Merge pull request button.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/docs/index.html b/docs/index.html
index fd453a8..6f0b462 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -11,10 +11,10 @@
-Website Name
+About the AnVIL Community Poll
-
-
+
+
@@ -32,8 +32,8 @@
-
-
+
+
@@ -61,7 +61,6 @@
-
@@ -87,9 +86,6 @@
summary {
display: list-item;
}
-details > summary > p:only-child {
- display: inline;
-}
pre code {
padding: 0;
}
@@ -146,15 +142,11 @@
href = "index.html";
var menuAnchor = $('a[href="' + href + '"]');
- // mark the anchor link active (and if it's in a dropdown, also mark that active)
- var dropdown = menuAnchor.closest('li.dropdown');
- if (window.bootstrap) { // Bootstrap 4+
- menuAnchor.addClass('active');
- dropdown.find('> .dropdown-toggle').addClass('active');
- } else { // Bootstrap 3
- menuAnchor.parent().addClass('active');
- dropdown.addClass('active');
- }
+ // mark it active
+ menuAnchor.tab('show');
+
+ // if it's got a parent navbar menu mark it active as well
+ menuAnchor.closest('li.dropdown').addClass('active');
// Navbar adjustments
var navHeight = $(".navbar").first().height() + 15;
@@ -183,8 +175,8 @@
border-radius: 4px;
}
-.tabset-dropdown > .nav-tabs > li.active:before, .tabset-dropdown > .nav-tabs.nav-tabs-open:before {
- content: "\e259";
+.tabset-dropdown > .nav-tabs > li.active:before {
+ content: "";
font-family: 'Glyphicons Halflings';
display: inline-block;
padding: 10px;
@@ -192,11 +184,18 @@
}
.tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
- content: "\e258";
- font-family: 'Glyphicons Halflings';
+ content: "";
border: none;
}
+.tabset-dropdown > .nav-tabs.nav-tabs-open:before {
+ content: "";
+ font-family: 'Glyphicons Halflings';
+ display: inline-block;
+ padding: 10px;
+ border-right: 1px solid #ddd;
+}
+
.tabset-dropdown > .nav-tabs > li.active {
display: block;
}
@@ -238,12 +237,12 @@
This website was created with the OTTR_Template_Website template.
-This enables simpler creation of websites that have automated checks for
-broken URLs and spelling errors, as well as automated rendering of all
-html files. No need to worry about git conflicts for html files!
-
Here is how you can include images in your website pages (this one is
-saved in resources/images to keep things tidy:
-
-
We will go through the following:
-
-
How to set up your repository to enable the GitHub actions of the
-template. (Setup tab)
-
How to host your website using GitHub (Hosting tab)
-
How to modify your website structure (Structure tab)
-
How to change your website style (Style tab)
-
More information about GitHub Actions (Git Actions tab)
-
+
+
+
Results Summary
+
+
The majority of current users leverage AnVIL for ongoing projects. Potential users were evenly split between never having used the AnVIL (but have heard of it) and having used AnVIL at some point previously.
+
Most of the respondents have a PhD or are currently working on a PhD.
+
Many respondents do computational work.
+
Almost all respondents are affiliated with a research intensive institution.
+
PRIMED, GREGoR, and eMERGE were the most popular consortia affiliations. CCDG and GTEx are also represented.
+
+For current users, the most common foreseeable computational need was large amounts of storage.
+
Many users reported they were “extremely likely” to recommend AnVIL, but we’d like to follow up with users who chose differently.
+
Click on each tab to learn more.
+
+
+
+
Design
+
+This user poll was conducted in Spring 2024 and was open for responses from February 15th to March 25th. 52 total responses were received, two of which were determined to be duplicates, leaving 50 responses for the analysis. The following graphic shows the arrangement of questions. The first question was used to separate respondents into “Current” and “Potential” users. Demographics, experience, awareness, and preference-related questions were asked of all users.
To enable the GitHub actions, your repository needs to be setup in a
-specific way.
-
For OTTR GitHub actions to run, they need to have credentials through
-a personal access token.
-
-
Set up your own personal access token following
-these instructions - but create a classic token.
-Keep this personal access token handy for the next step. When you get to
-the permissions page, check the box that says repo and
-select all that is underneath that header. No other permissions are
-necessary.
-
-
-
-Click here for more detailed instructions. The instructions for this
-step may change with updates to GitHub.
-
-
First, go to your username settings, by clicking on your user icon
-(upper right corner) and scrolling down to settings.
-
-
Next, scroll all the way down on the far right menu to “Developer
-Settings”.
-
-
Then select “Personal Access Tokens” and “Tokens (classic)”
-
-
Then click “Generate new token” and confirm that you want
-classic.
-
-
Finally, add a name select all the repo scopes and scroll down to the
-green button to generate the token. Copy this somewhere safe to then
-paste into your repository settings.
-
-
-
-
In your new OTTR_Template_Website derived repository, go to Settings
-> Secrets and variables > Actions. Click
-New Repository Secret.
-
-
-
-
In the window opened, name this new secret GH_PAT and
-paste the personal access token in the box below. (Note that the name
-GH_PAT is specific to how OTTR works and other secret names
-cannot be used and for OTTR to still work).
-
Click the green button to add the secret.
-
-
-
-
Allow GitHub Actions
-
Go to the settings menu for your repository that you created from the
-template. This should be located at the top of GitHub on the right
-side.
-
Scroll down to the “Actions” button and click it, then click
-“General”.
-
-
-
Scroll down to the workflow permissions section and select “Read and
-write permissions”, then click “Allow GitHub actions to create and
-approve pull requests.
-
Finally, click “save”.
-
-
-
-
Protect branches
-
Although this isn’t entirely required, its strongly recommended that
-you use these settings to protect your main branches.
-
Click on settings in the far upper right corner:
-
-Click on branches:
-
-Click the add rule button.
-
-Type “main” as the branch name pattern:
-
-Click on the following boxes to require pull requests before
-merging:
-
-
Note that if you have admin privileges on this repository, you will
-likely still be able to override these branch protections so use caution
-when git pushing!
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/editing.Rmd b/editing.Rmd
deleted file mode 100644
index e65c8f5..0000000
--- a/editing.Rmd
+++ /dev/null
@@ -1,50 +0,0 @@
----
-title: "Editing"
-output: html_document
----
-
-Now that your website is alive and hosted, how do you start customizing it to have your content?
-
-If you already know how to file pull requests, feel free to start making edits as you see fit using some of the structure points we've noted below.
-
-If you are not familiar with pull requests, [read this guide to get started](https://www.ottrproject.org/editing_website.html).
-
-
-## Title
-
-To change the title of the website, modify the `name` line of the `_site.yml` file.
-
-```{r, fig.align='center', fig.alt= "Change title", echo = FALSE, out.width="40%"}
-knitr::include_graphics("resources/images/title.png")
-```
-
-## Modifying pages
-
-Pages are specified in the navigation bar by the lines that say `-text:` and `href:` .
-
-The `-text:` specifies what the navigation bar will say for that tab.
-
-The `href:` specifies which rendered Rmd file to use for that tab, it needs to be the html version of this file.
-
-The tabs are specified to be aligned to the left (as specified by the `left` on line 5).
-
-The tabs will appear in the order listed.
-
-You can also add icons to these tabs using [font awesome](https://fontawesome.com/) as is shown for the contact page on line 18.
-
-```{r, fig.align='center', fig.alt= "Page modification", echo = FALSE, out.width="40%"}
-knitr::include_graphics("resources/images/page_modification.png")
-```
-
-Other icon options include [Bootstap glyphicons](https://www.w3schools.com/bootstrap/bootstrap_ref_comp_glyphs.asp) or [ion icons](https://ionic.io/ionicons). Note that **not all icons will work** because they are not all set up with the packages that make rendering the website possible, so this may require some trial and error. Here is an example of how you would use all of these icon options to add more:
-
-```{r, fig.align='center', fig.alt= "more icon options", echo = FALSE, out.width="40%"}
-knitr::include_graphics("resources/images/more_icons.png")
-```
-
-
-This would result in a navigation bar with these icons:
-
-```{r, fig.align='center', fig.alt= "more icon options", echo = FALSE, out.width="70%"}
-knitr::include_graphics("resources/images/more_icons_result.png")
-```
diff --git a/git_actions.Rmd b/git_actions.Rmd
deleted file mode 100644
index f53e97d..0000000
--- a/git_actions.Rmd
+++ /dev/null
@@ -1,77 +0,0 @@
----
-title: "Git Actions"
-output: html_document
----
-
-
-We have set up several checks for website content edits.
-
-When you create a pull request, you will see something like this if everything is successful. You can click on the `preview of website here` link to see a preview. Please note that some features may not be possible to see in the preview. For example, icons may only show up as a box.
-
-```{r, fig.align='center', fig.alt= "pull request checks", echo = FALSE, out.width="70%"}
-knitr::include_graphics("resources/images/checks.png")
-```
-
-## Rendering Action
-
-If the rendering action fails, you will see something like this:
-
-```{r, fig.align='center', fig.alt= "render issue pull request image", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/render_fail.png")
-```
-
-If you click on where it says `Details` on the far right, you will be taken to more information about what may have gone wrong.
-
-```{r, fig.align='center', fig.alt= "render issue pull request image", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/render_fail2.png")
-```
-
-
-For example, we can see that an R object was not found in one of the files. You could identify which file by scrolling up.
-
-```{r, fig.align='center', fig.alt= "render issue pull request image", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/render_fail_details.png")
-```
-
-## Spelling and Style Action
-
-You may find that you have spelling errors if you get the following message from your pull request (PR):
-
-```{r, fig.align='center', fig.alt= "spelling issue pull request image", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/spelling.png")
-```
-
-If this happens, click the `Download the errors here.` link. This will take you to a table with words that the check thought were misspelled, as well as what file they occurred in and the lines in that file.
-
-Add words that are **not** actually misspelled to the `dictionary.txt` file located in the `resources` directory. It's a good idea to try to keep this in alphabetical order.
-
-For words that are indeed misspelled, fix the errors and push your changes to your pull request.
-
-You should then see that your pull request has a different message that tells you that you have no spelling errors.
-
-## URL Check
-
-You may find that you have broken URLs. If so, you will get this message when you create a Pull Request.
-
-```{r, fig.align='center', fig.alt= "URL issue pull request image", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/url_check.png")
-```
-You can click on the `Download the errors here` link to see a document with a list of broken URLs and what files they are located in, like this:
-
-```{r, fig.align='center', fig.alt= "broken url list", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/urls.png")
-```
-
-Here we can see that there are two URLs that are broken in the `git_actions.Rmd` file.
-
-## Completing a Pull Request
-
-Once all the Git Action checks pass, you can merge your pull request to your main branch for your website.
-
-```{r, fig.align='center', fig.alt= "finishing a pull request", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/checks2.png")
-```
-
-If you are only working on this yourself without others to review your pull request, you can click the `Merge without waiting for requirements to be met` box, so that you can click the `Merge pull request` button.
-
-
diff --git a/hosting.Rmd b/hosting.Rmd
deleted file mode 100644
index aedd72b..0000000
--- a/hosting.Rmd
+++ /dev/null
@@ -1,39 +0,0 @@
----
-title: "Hosting"
-output: html_document
----
-
-## Hosting Setup
-
-To host your website on GitHub, you will need to go to settings and click on the pages tab.
-
-Again to go to settings click on the far upper right corner:
-
-```{r, fig.align='center', fig.alt= "Clicking on settings", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/settings.png")
-```
-
-Click on Pages:
-
-```{r, fig.align='center', fig.alt= "Clicking on pages", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/pages.png")
-```
-
-Select the main branch and the root directory and press save. Be sure to also check the "Enforce HTTPS" box. Afterwards your repository should look like this:
-
-```{r, fig.align='center', fig.alt= "setup website hosting", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/website_setup.png")
-```
-
-Note that in general, your website will be published to a URL like this:
-
-```
-https://username.github.io/repository_name/
-```
-If you have multiple websites published underneath your username or organization, this should still publish fine. This website will be a subdirectory that is named whatever you have named this repository.
-
-If you've published this website underneath a GitHub organization (not your own personal GitHub profile), then in the above example URL the organization name will be where we've put `username`.
-
-For more about GitHub pages (including how to personalize your URL) see the GitHub pages documentation here: https://pages.github.com/
-
-Sometimes, GitHub page publishing will take a bit of time. You can click on the `Actions` tab in your repository and see if there is a `pages and deployment` action currently running (indicated by a yellow circle next to the action name). If this is the case, you will need to wait until this becomes a green check mark before your GitHub page will be published.
diff --git a/index.Rmd b/index.Rmd
deleted file mode 100644
index bbdd869..0000000
--- a/index.Rmd
+++ /dev/null
@@ -1,34 +0,0 @@
----
-title: "**Website Name**"
-output:
- html_document
----
-
-
-Banner text!
-
-
-### **About**
-
-This website was created with the OTTR_Template_Website template. This enables simpler creation of websites that have automated checks for broken URLs and spelling errors, as well as automated rendering of all html files. No need to worry about git conflicts for html files!
-
-Here is how you can include images in your website pages (this one is saved in `resources/images` to keep things tidy:
-
-```{r, fig.align='center', fig.alt= "Example image", echo = FALSE, out.width="30%"}
-knitr::include_graphics("resources/images/example_image.png")
-```
-
-
-
-We will go through the following:
-
-1. How to set up your repository to enable the GitHub actions of the template. (Setup tab)
-2. How to host your website using GitHub (Hosting tab)
-3. How to modify your website structure (Structure tab)
-4. How to change your website style (Style tab)
-5. More information about GitHub Actions (Git Actions tab)
-
-Click on each tab to learn more.
-
-
-
diff --git a/pages/Awareness.Rmd b/pages/Awareness.Rmd
new file mode 100644
index 0000000..b52e14f
--- /dev/null
+++ b/pages/Awareness.Rmd
@@ -0,0 +1,123 @@
+---
+title: "Awareness"
+output: html_document
+---
+
+```{r, message = FALSE, echo = FALSE,results='hide', warning=FALSE}
+library(tidyverse)
+library(here)
+
+# Inherit `resultsTidy`
+knitr::knit_child(here("TidyData.Rmd"))
+# Import functions to stylize plots
+source(here("resources/scripts/shared_functions.R"))
+```
+
+
+
+# Monthly AnVIL Demos
+
+## Raw responses
+
+```{r echo=FALSE, message=FALSE}
+demoPlotRaw <- resultsTidy %>%
+ group_by(UserType, AnVILDemo) %>%
+ summarize(count = n()) %>%
+ ggplot(aes(y=reorder(AnVILDemo, count),
+ x = count,
+ fill = UserType)) +
+ geom_bar(stat = "identity") +
+ ggtitle("Have you attended a monthly AnVIL Demo?")
+
+stylize_bar(demoPlotRaw)
+```
+
+## Awareness
+
+```{r, message=FALSE, echo = FALSE}
+demoPlot <- resultsTidy %>%
+ group_by(UserType, AnVILDemoAwareness) %>%
+ summarize(count = n()) %>%
+ ggplot(aes(y = AnVILDemoAwareness,
+ x = count,
+ fill = UserType)) +
+ geom_bar(stat = "identity") +
+ ggtitle("Have you attended a monthly AnVIL Demo?")
+
+stylize_bar(demoPlot) +
+ ylab("Awareness")
+
+```
+
+## Takeaway
+
+Most respondents have not attended an AnVIL Demo. To investigate whether this was an awareness issue, we aggregated all responses except `No, didn't know of`. We see that the majority of respondents are aware of AnVIL Demos; their responses are simply distributed across different ways of engaging with the demos. Further, there's awareness among both current and potential AnVIL users.
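+
+The `AnVILDemoAwareness` variable is inherited with `resultsTidy` rather than computed on this page; a minimal, non-evaluated sketch of how that aggregation might be derived (the exact labels are assumptions) is:
+
+```{r, eval = FALSE}
+# Hypothetical sketch; the actual derivation is inherited with resultsTidy (see TidyData.Rmd)
+resultsTidy <- resultsTidy %>%
+  mutate(AnVILDemoAwareness = if_else(AnVILDemo == "No, didn't know of",
+                                      "Not Aware of",
+                                      "Aware of"))
+```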
+
+
+
+# AnVIL Support Forum
+
+```{r, message=FALSE, echo=FALSE}
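+# Rewrite "No, " as "No " so the commas inside "No, but aware of" / "No, didn't know of"
+# are not treated as separators when splitting the multi-select answers below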
+forumdf <- resultsTidy %>%
+ mutate(AnVILSupportForum = str_replace(AnVILSupportForum,
+ pattern = "No, ",
+ replacement= "No ")) %>%
+ separate(AnVILSupportForum,
+ c("forumInteractionA", "forumInteractionB", "forumInteractionC"),
+ sep = ", ",
+ fill = "right") %>%
+ pivot_longer(starts_with("forumInteraction"), values_to = "forumInteractionDescription") %>%
+ group_by(UserType, CurrentUsageDescription, forumInteractionDescription) %>%
+ summarize(count = n()) %>%
+ drop_na() %>%
+ mutate(forumInteractionDescription =
+ factor(forumInteractionDescription, levels = c("Posted in", "Answered someone's post", "Read through others' posts", "No but aware of", "No didn't know of")),
+ forumAwareness = factor(
+ case_when(
+ forumInteractionDescription == "Posted in" ~ "Aware of",
+ forumInteractionDescription == "Answered someone's post" ~ "Aware of",
+ forumInteractionDescription == "Read through others' posts" ~ "Aware of",
+ forumInteractionDescription == "No but aware of" ~ "Aware of",
+ forumInteractionDescription == "No didn't know of" ~ "Not Aware of"
+ ), levels = c("Not Aware of", "Aware of")),
+ forumUse = factor(
+ case_when(
+ forumInteractionDescription == "Posted in" ~ "Have utilized",
+ forumInteractionDescription == "Answered someone's post" ~ "Have utilized",
+ forumInteractionDescription == "Read through others' posts" ~ "Have utilized",
+ forumInteractionDescription == "No but aware of" ~ "Have not utilized",
+ forumInteractionDescription == "No didn't know of" ~ "Have not utilized"
+ ), levels = c("Have not utilized", "Have utilized"))
+)
+```
+
+## Raw Responses
+
+```{r, message=FALSE, echo=FALSE}
+forumPlotRaw <- ggplot(forumdf,
+ aes(y = reorder(forumInteractionDescription, count),
+ x = count,
+ fill = UserType)) +
+ geom_bar(stat = "identity") +
+ ggtitle("Have you ever read or posted in our AnVIL Support Forum?")
+
+stylize_bar(forumPlotRaw)
+```
+
+## Awareness
+
+```{r, message=FALSE, echo=FALSE}
+forumPlot <- ggplot(forumdf,
+ aes(y = forumAwareness,
+ x = count,
+ fill = UserType)) +
+ geom_bar(stat = "identity") +
+ ggtitle("Have you ever read or posted in our AnVIL Support Forum?")
+
+stylize_bar(forumPlot) +
+ ylab("Awareness")
+```
+
+## Takeaway
+
+Most respondents have not used the AnVIL support forum. We aggregated these responses to examine awareness. We observe that there is awareness of the support forum across potential and current users. While utilization in some form is reported by about 20% of respondents, reading through others' posts is the most common way of utilizing the support forum within this sample.
\ No newline at end of file
diff --git a/pages/CurrentUserQs.Rmd b/pages/CurrentUserQs.Rmd
new file mode 100644
index 0000000..df87946
--- /dev/null
+++ b/pages/CurrentUserQs.Rmd
@@ -0,0 +1,116 @@
+---
+title: "Current Users"
+author: ""
+date: ""
+output: html_document
+---
+
+```{r, message = FALSE, echo = FALSE,results='hide', warning=FALSE}
+library(tidyverse)
+library(here)
+library(grid) #for Grobs
+library(scales) #pretty breaks
+
+# Inherit `resultsTidy`
+knitr::knit_child(here("TidyData.Rmd"))
+# Import functions to stylize plots
+source(here("resources/scripts/shared_functions.R"))
+```
+
+
+
+# Length of Use of the AnVIL
+
+```{r, message = FALSE, echo = FALSE}
+timeUsePlot <- resultsTidy %>%
+ group_by(LengthOfUse) %>%
+ summarize(count = n()) %>%
+ drop_na() %>%
+ ggplot(aes(x = LengthOfUse,
+ y = count,
+ fill = "#25445A")) +
+ geom_bar(stat = "identity") +
+ geom_text(aes(label = count, group = LengthOfUse),
+ vjust = -1, size=2) +
+ ggtitle("How long have you been using the AnVIL?")
+
+stylize_bar(timeUsePlot, usertypeColor = FALSE, singleColor = TRUE) +
+ xlab("Years of Use") +
+ ylab("Count") +
+ theme(legend.position = "none")
+```
+
+## Takeaway
+
+We observe fairly even sampling of current users with regard to the number of years they've been using the AnVIL.
+
+
+
+# Foreseeable Computational Needs
+
+```{r, message=FALSE, echo=FALSE, warning=FALSE}
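+# Split the comma-separated multi-select answers (up to four selections) into
+# one row per resource, then count how often each resource was selected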
+compNeedsPlot <- resultsTidy %>%
+ separate(NeededResources,
+ c("whichResourceA", "whichResourceB", "whichResourceC", "whichResourceD"),
+ sep = ", ", fill = "right") %>%
+ pivot_longer(starts_with("whichResource"), values_to = "ResourceDescription") %>%
+ group_by(ResourceDescription) %>%
+ summarize(count = n()) %>%
+ drop_na() %>%
+ ggplot(aes(x = count,
+ y = reorder(ResourceDescription, count),
+ fill = "#25445A")) +
+ geom_text(aes(label = count, group = ResourceDescription),
+ hjust = -1, size=2) +
+ geom_bar(stat = "identity") +
+ ggtitle("What computational and storage resources do you foresee\nneeding in the next 12 months?")
+
+ stylize_bar(compNeedsPlot, usertypeColor = FALSE, singleColor = TRUE) +
+ theme(legend.position = "none")
+```
+
+## Takeaway
+
+Of the `r nrow(resultsTidy %>% filter(UserType == "Current User"))` current users, all `r nrow(resultsTidy) - sum(is.na(resultsTidy$NeededResources))` provided an answer to this question. The most common response was needing large amounts of storage.
+
+## Potential Follow-ups
+
+ - What do we see in other responses from the "don't know" responses?
+
+
+
+# Recommendation Likelihood
+
+```{r, message = FALSE, echo = FALSE}
+recLikePlot <- resultsTidy %>%
+ group_by(RecommendationLikelihood) %>%
+ summarize(count = n()) %>%
+ drop_na() %>% #not asked to everyone
+ ggplot(aes(x = RecommendationLikelihood,
+ y = count,
+ fill = as.factor(RecommendationLikelihood))) +
+ geom_bar(stat="identity") +
+ ggtitle("How likely are you to recommend the AnVIL to a colleague?") +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1,1.1, "cm")) +
+ annotation_custom(textGrob("Extremely\nlikely", gp=gpar(fontsize=8, fontface = "bold")),xmin=5,xmax=5,ymin=-1.25,ymax=-1.25) +
+ annotation_custom(textGrob("Not at all\nlikely", gp=gpar(fontsize=8, fontface= "bold")),xmin=1,xmax=1,ymin=-1.25,ymax=-1.25) +
+ scale_y_continuous(breaks= pretty_breaks()) +
+ geom_text(aes(label = count, group = RecommendationLikelihood),
+ vjust = -1, size=2)
+
+ stylize_bar(recLikePlot, usertypeColor = FALSE, sequentialColor = TRUE) +
+ xlab("Recommendation likelihood") +
+ ylab("Count") +
+ theme(legend.position = "none")
+```
+
+## Takeaway
+
+There's a fairly bimodal distribution here with users either extremely likely or only moderately likely to recommend the AnVIL.
+
+## Potential Follow-ups
+
+ - What do we see in other responses from those who are extremely likely to recommend the AnVIL to a colleague? e.g., Are they part of a consortium?
+ - What do we see in other responses from those who are only moderately or not at all likely to recommend the AnVIL?
+
diff --git a/pages/Demographics.Rmd b/pages/Demographics.Rmd
new file mode 100644
index 0000000..c0d632a
--- /dev/null
+++ b/pages/Demographics.Rmd
@@ -0,0 +1,179 @@
+---
+title: "Demographics"
+author: ""
+date: ""
+output: html_document
+---
+
+```{r, message = FALSE, echo = FALSE,results='hide', warning=FALSE}
+library(tidyverse)
+library(here)
+library(grid) #for Grobs
+library(scales) #pretty breaks
+library(kableExtra)
+
+# Inherit `resultsTidy`
+knitr::knit_child(here("TidyData.Rmd"))
+# Import functions to stylize plots
+source(here("resources/scripts/shared_functions.R"))
+```
+
+
+
+# Highest Degree
+
+```{r, message=FALSE, echo=FALSE}
+degreePlot <- resultsTidy %>%
+ group_by(FurtherSimplifiedDegrees, UserType) %>%
+ summarize(n = n()) %>%
+ ggplot(aes(y = reorder(FurtherSimplifiedDegrees, n, sum),
+ x = n,
+ fill = UserType
+ )) +
+ geom_bar(position = "stack", stat="identity") +
+ geom_text(
+ aes(label = after_stat(x), group = FurtherSimplifiedDegrees),
+ stat = 'summary', fun = sum, hjust = -1, size=2
+ ) +
+ coord_cartesian(clip = "off") +
+ ggtitle("What is the highest degree you have attained?")
+
+stylize_bar(degreePlot)
+```
+
+## Takeaway
+
+Most of the respondents have a PhD or are currently working on a PhD, though a range of career stages are represented.
+
+
+
+# Kind of Work
+
+```{r, message = FALSE, echo = FALSE}
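+# Respondents could select several kinds of work; split the comma-separated answers
+# into one row per selection and plot one tile per respondent x kind of work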
+resultsTidy %>%
+ separate(KindOfWork,
+ c("whichWorkA", "whichWorkB", "whichWorkC", "whichWorkD", "whichWorkE", "whichWorkF", "whichWorkG", "whichWorkH", "whichWorkI", "whichWorkJ"),
+ sep=", ", fill="right") %>%
+ pivot_longer(starts_with("whichWork"), values_to = "whichWorkDescription") %>%
+ select(Timestamp, UserType, whichWorkDescription) %>%
+ mutate(whichWorkDescription =
+ recode(whichWorkDescription,
+ "computational education" = "Computational education",
+ "Program administration," = "Program administration"),
+ whichWorkDescription = factor(whichWorkDescription),
+ Timestamp = factor(Timestamp)
+ ) %>%
+ drop_na() %>%
+ ggplot(aes(x = Timestamp,
+ y = whichWorkDescription,
+ fill = whichWorkDescription
+ )) +
+ geom_tile() +
+ theme_bw() +
+ theme(axis.text.x=element_blank(),
+ axis.ticks.x=element_blank(),
+ legend.position = "None") +
+ ylab("") +
+ ggtitle("What kind of work do you do?") +
+ xlab("Respondent") +
+ facet_wrap(~UserType)
+```
+
+## Takeaway
+
+Only a few responses showed project management, leadership, or administration as their only kind of work. This increases our confidence that this won't confound later questions asking about usage of datasets or tools.
+
+## Potential Follow-ups
+
+ - Use this together with other info to try to cluster respondents/users into personas.
+
+
+
+# Institutional Affiliation
+
+```{r, message=FALSE, echo = FALSE}
+instPlot <- resultsTidy %>%
+ mutate(FurtherSimplifiedInstitutionalType =
+ factor(FurtherSimplifiedInstitutionalType,
+ levels = c("Industry & Other", "Education Focused", "Research Intensive"))) %>%
+ group_by(UserType, FurtherSimplifiedInstitutionalType) %>%
+ summarize(InstitutionalCount = n()) %>%
+ ggplot(aes(
+ y = FurtherSimplifiedInstitutionalType,
+ x = InstitutionalCount,
+ fill = UserType
+ )) +
+ geom_bar(position = "stack", stat = "identity") +
+ geom_text(aes(label = after_stat(x),
+ group = FurtherSimplifiedInstitutionalType),
+ stat = 'summary', fun = sum, hjust = -1, size=2
+ ) +
+ annotation_custom(textGrob("- R1 University \n- Med Campus \n- Research Center\n- NIH ", gp = gpar(fontsize = 8)), xmin = -8.5, xmax = -8.5, ymin = 2.65, ymax = 2.65) +
+ annotation_custom(textGrob("- Industry \n- International Loc\n- Unknown ", gp = gpar(fontsize = 8)), xmin = -8.5, xmax = -8.5, ymin = .7, ymax = .7) +
+ annotation_custom(textGrob("- R2 University \n- Community College", gp=gpar(fontsize=8)),xmin=-8.5,xmax=-8.5,ymin=1.75,ymax=1.75) +
+ coord_cartesian(clip = "off") +
+ ggtitle("What institution are you affiliated with?")
+
+stylize_bar(instPlot)
+```
+
+## Takeaway
+
+Most of the individuals using the AnVIL report being affiliated with a research-intensive institution. A further breakdown of these counts is below.
+
+## Potential Follow-ups
+
+ - Does industry not being represented in this sample of current users generalize to the larger AnVIL user base? If so, why?
+
+### Related Plots
+
+```{r, message = FALSE, echo = FALSE}
+instPlotB <- resultsTidy %>%
+ select(c(UserType, InstitutionalType)) %>%
+ group_by(UserType, InstitutionalType) %>%
+ summarize(InstitutionalCount = n()) %>%
+ ggplot(aes(
+ y = reorder(InstitutionalType, InstitutionalCount, sum),
+ x = InstitutionalCount,
+ fill = UserType
+ )) +
+ geom_bar(position = "stack", stat = "identity") +
+ geom_text(
+ aes(label = after_stat(x), group = InstitutionalType),
+ stat = 'summary', fun = sum, hjust = -1, size=2
+ ) +
+ annotation_custom(textGrob(paste("There are\n", length(unique(resultsTidy$InstitutionalAffiliation)) ,"\nunique institutions"), gp=gpar(fontsize=8, fontface = "bold")),xmin=34,xmax=34,ymin=2.5,ymax=2.5) +
+ coord_cartesian(clip = "off") +
+ ggtitle("What institution are you affiliated with?")
+
+stylize_bar(instPlotB)
+```
+
+
+
+# Consortia Affiliations
+
+```{r, message=FALSE, echo = FALSE}
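+# Normalize ";", "&", and " and" separators to commas so multi-consortium answers
+# split cleanly, then count how often each consortium was named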
+consortiaTable <- resultsTidy %>%
+ mutate(ConsortiaAffiliations = str_replace_all(ConsortiaAffiliations, c(";|&| and"), ",")) %>%
+ separate(ConsortiaAffiliations,
+ c("whichConsortiumA", "whichConsortiumB", "whichConsortiumC", "whichConsortiumD"),
+ sep=", ", fill = "right") %>%
+ pivot_longer(starts_with("whichConsortium"), values_to = "whichConsortiumName") %>%
+ group_by(whichConsortiumName) %>%
+ summarize(count = n()) %>%
+ drop_na() %>%
+ arrange(count)
+```
+
+
+Of `r nrow(resultsTidy)` responses, `r sum(!is.na(resultsTidy$ConsortiaAffiliations))` provided an affiliation, with `r nrow(consortiaTable)` unique affiliations provided across those responses (respondents could select more than one consortium). The following table shows the most represented consortia.
+
+```{r, message = FALSE, echo = FALSE}
+consortia_df <-
+ consortiaTable[which(consortiaTable$count >1),] %>%
+ rename(`consortium` = whichConsortiumName)
+
+kableExtra::kable(consortia_df, table.attr = "style='width:20%;'")
+```
+
diff --git a/pages/Experience.Rmd b/pages/Experience.Rmd
new file mode 100644
index 0000000..ca67cb0
--- /dev/null
+++ b/pages/Experience.Rmd
@@ -0,0 +1,246 @@
+---
+title: "Experience"
+output: html_document
+date: ""
+---
+
+```{r, message = FALSE, echo = FALSE,results='hide', warning=FALSE}
+library(tidyverse)
+library(here)
+library(grid) #for Grobs
+
+# Inherits `resultsTidy`
+knitr::knit_child(here("TidyData.Rmd"))
+# Import functions to stylize plots
+source(here("resources/scripts/shared_functions.R"))
+```
+
+
+
+# Tool & Resource Knowledge/Comfort level
+
+```{r, message=FALSE, echo = FALSE}
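+# Build per-tool average comfort scores: all Score_ columns for current users, and
+# only the Score_AllTech (separate-from-AnVIL) columns for potential users, then combine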
+toPlotToolKnowledge <- bind_rows(
+ resultsTidy %>%
+ filter(UserType == "Current User") %>%
+ select(starts_with("Score_")) %>%
+ colSums(na.rm = TRUE) %>%
+ as.data.frame() %>% `colnames<-`(c("totalScore")) %>%
+ mutate(nscores = sum(resultsTidy$UserType == "Current User"),
+ avgScore = totalScore / nscores,
+ UserType = "Current User") %>%
+ mutate(WhereTool = rownames(.)) %>%
+ separate(WhereTool, c("AnVILorNo", "Tool"), sep = "Tech", remove = TRUE) %>%
+ mutate(AnVILorNo =
+ case_when(AnVILorNo == "Score_CurrentAnVIL" ~ "On the AnVIL",
+ AnVILorNo == "Score_All" ~ "Separate from the AnVIL"
+ ),
+ Tool =
+ recode(Tool, "JupyterNotebooks" = "Jupyter Notebooks",
+ "WDL" = "Workflows",
+ "CommandLine" = "Unix / Command Line",
+ "AccessData" = "Access controlled access data",
+ "Terra" = "Terra Workspaces",
+ "BioconductorRStudio" = "Bioconductor & RStudio"
+ )
+ ),
+ resultsTidy %>%
+ filter(UserType == "Potential User") %>%
+ select(starts_with("Score_AllTech")) %>%
+ colSums() %>%
+ as.data.frame() %>% `colnames<-`(c("totalScore")) %>%
+ mutate(nscores = sum(resultsTidy$UserType == "Potential User"),
+ avgScore = totalScore / nscores,
+ UserType = "Potential User") %>%
+ mutate(WhereTool = rownames(.)) %>%
+ separate(WhereTool, c("AnVILorNo", "Tool"), sep = "Tech", remove = TRUE) %>%
+ mutate(AnVILorNo =
+ case_when(AnVILorNo == "Score_CurrentAnVIL" ~ "On the AnVIL",
+ AnVILorNo == "Score_All" ~ "Separate from the AnVIL"
+ ),
+ Tool =
+ recode(Tool, "JupyterNotebooks" = "Jupyter Notebooks",
+ "WDL" = "Workflows",
+ "CommandLine" = "Unix / Command Line",
+ "AccessData" = "Access controlled access data",
+ "Terra" = "Terra Workspaces",
+ "BioconductorRStudio" = "Bioconductor & RStudio"
+ )
+ )
+) %>%
+ mutate(UserType = factor(UserType, levels = c("Potential User", "Current User")))
+
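+# "Bioconductor & RStudio" was asked as a single item; duplicate its average
+# score into separate "Bioconductor" and "RStudio" rows so each appears on its
+# own line in the plot, then drop the combined row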
+roi <- toPlotToolKnowledge[which(toPlotToolKnowledge$Tool == "Bioconductor & RStudio"),]
+toPlotToolKnowledge <- rows_append(toPlotToolKnowledge, data.frame(
+ UserType = rep(roi$UserType,2),
+ avgScore = rep(roi$avgScore,2),
+ AnVILorNo = rep(roi$AnVILorNo,2),
+ Tool = c("Bioconductor", "RStudio")
+ )) %>%
+ rows_delete(., data.frame(roi))
+```
+
+```{r message=FALSE, echo=FALSE}
+ggplot(toPlotToolKnowledge,
+ aes(y = reorder(Tool, avgScore), x = avgScore)) +
+ geom_point(aes(color = UserType, shape = AnVILorNo)) +
+ scale_x_continuous(breaks = 0:5, labels = 0:5, limits = c(0,5)) +
+ ylab("Tool or Data Resource") +
+ xlab("Average Knowledge or Comfort Score") +
+ theme_bw() +
+ theme(panel.background = element_blank(),
+ panel.grid.minor.x = element_blank()) +
+annotation_custom(textGrob("Don't know\nat all", gp=gpar(fontsize=8, fontface = "bold")),xmin=0,xmax=0,ymin=-2,ymax=-2) +
+ annotation_custom(textGrob("Extremely\ncomfortable", gp=gpar(fontsize=8, fontface= "bold")),xmin=5,xmax=5,ymin=-2,ymax=-2) +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1,1.1, "cm")) +
+ ggtitle("How would you rate your knowledge of or\ncomfort with these technologies or data features?") +
+ scale_color_manual(values = c("#E0DD10", "#035C94")) +
+ scale_shape_manual(values = c(4, 16)) +
+ theme(legend.title = element_blank())
+```
+
+## Takeaways
+
+Except for Galaxy, potential users tend to report lower comfort levels for the various tools and technologies than current users do. Where tools are available both on and off the AnVIL, current users reported similar comfort levels in both settings.
+
+Overall, there is less comfort with containers or workflows than with the various programming languages and integrated development environments (IDEs).
+
+## Potential follow-ups
+
+- Potentially prioritize developing resources for the tools or data resources with the lowest reported comfort levels (see the sketch below)
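+
+A minimal sketch of one way to surface those, assuming the `toPlotToolKnowledge` data frame built above: list the five lowest average comfort scores among potential users as candidate topics for new training resources.
+
+```{r, message=FALSE, echo=FALSE}
+# Sketch (not part of the original analysis): tools or data resources with the
+# lowest average comfort scores among potential users
+toPlotToolKnowledge %>%
+  filter(UserType == "Potential User") %>%
+  arrange(avgScore) %>%
+  select(Tool, AnVILorNo, avgScore) %>%
+  head(5)
+```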
+
+
+
+# Types of data analyzed
+
+TBA
+
+
+
+# Genomics and Clinical Research Experience
+
+```{r, message=FALSE, echo = FALSE}
+experienceDf <- resultsTidy %>%
+ select(HumanGenomicExperience, HumanClinicalExperience, NonHumanGenomicExperience, UserType) %>%
+ pivot_longer(c(HumanGenomicExperience, HumanClinicalExperience, NonHumanGenomicExperience),
+ names_to = "researchType",
+ values_to = "experienceLevel") %>%
+ mutate(experienceLevel =
+ factor(experienceLevel, levels = c("Not at all experienced", "Slightly experienced", "Somewhat experienced", "Moderately experienced", "Extremely experienced")),
+ researchType = case_when(
+ researchType == "HumanClinicalExperience" ~ "Human Clinical Research",
+ researchType == "HumanGenomicExperience" ~ "Human Genomic Research",
+ researchType == "NonHumanGenomicExperience" ~ "Non-human\nGenomic Research")
+ ) %>%
+ group_by(researchType, experienceLevel, UserType) %>%
+ summarize(n = n())
+
+ggplot(experienceDf, aes(x=experienceLevel,y=n, fill = experienceLevel)) +
+ facet_grid(~researchType) +
+ geom_bar(stat="identity") +
+ theme_bw() +
+ theme(panel.background = element_blank(),
+ panel.grid = element_blank()) +
+ theme(axis.text.x = element_text(angle = 45, hjust=1)) +
+ geom_text(
+ aes(label = after_stat(y), group = experienceLevel),
+ stat = 'summary', fun = sum, vjust = -0.5, size=2
+ ) +
+ ylab("Count") +
+ xlab ("Reported Experience Level") +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1,1.05, "cm")) +
+ scale_fill_manual(values = c("#035C94","#035385","#024A77","#024168", "#02395B")) +
+ theme(legend.position = "none")+
+ ggtitle("How much experience do you have analyzing\nthe following data categories?")
+```
+
+## Takeaway
+
+21 respondents reported being extremely experienced in analyzing human genomic data, while only 6 reported being not at all experienced with it. However, for human clinical data and non-human genomic data, more respondents reported being not at all experienced than reported being extremely experienced.
+
+## Potential Follow-ups
+
+- What is the overlap among respondents who are moderately or extremely experienced in these various categories? (see the sketch below)
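+
+A minimal sketch of how that overlap could be tabulated, assuming the `clinicalFlag`, `humanGenomicFlag`, and `nonHumanGenomicFlag` columns inherited from `TidyData.Rmd` (the same flags used to subset the controlled access plots below):
+
+```{r, message=FALSE, echo=FALSE}
+# Sketch (not part of the original analysis): respondents per combination of the
+# moderately/extremely experienced flags
+resultsTidy %>%
+  count(clinicalFlag, humanGenomicFlag, nonHumanGenomicFlag)
+```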
+
+
+
+# General interest in controlled access datasets
+
+```{r message=FALSE, echo=FALSE}
+dataInterestPlot <- resultsTidy %>%
+ group_by(InterestControlledData) %>%
+ summarize(count = n()) %>%
+ ggplot(aes(x = InterestControlledData,
+ y = count,
+ fill = as.factor(InterestControlledData))) +
+ geom_bar(stat="identity") +
+ ggtitle("How interested are you in working with controlled access datasets?") +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1,1.1, "cm")) +
+ annotation_custom(textGrob("Extremely\ninterested", gp=gpar(fontsize=8, fontface = "bold")),xmin=5,xmax=5,ymin=-3.5,ymax=-3.5) +
+ annotation_custom(textGrob("Not at all\ninterested", gp=gpar(fontsize=8, fontface= "bold")),xmin=1,xmax=1,ymin=-3.5,ymax=-3.5) +
+ scale_y_continuous(breaks = scales::pretty_breaks()) +
+ geom_text(aes(label = count, group = InterestControlledData),
+ vjust = -1, size=2)
+
+ stylize_bar(dataInterestPlot, usertypeColor = FALSE, sequentialColor = TRUE) +
+ xlab("Interest level") +
+ ylab("Count") +
+ theme(legend.position = "none")
+```
+
+## Takeaway
+
+Over half of respondents reported they are extremely interested in working with controlled access datasets.
+
+
+
+# Controlled Access Datasets (specific interest)
+
+## All responses
+
+```{r message=FALSE, echo=FALSE}
+onAnVILDF <- read_delim(here("data/controlledAccessData_codebook.txt"),
+ delim = "\t",
+ col_select = c(whichControlledAccess, AnVIL_Availability))
+
+resultsTidy %>%
+ prep_df_whichData(onAnVILDF = onAnVILDF) %>%
+ plot_which_data()
+```
+
+## Just responses from those moderately or extremely experienced with clinical data
+
+```{r message=FALSE, echo=FALSE}
+resultsTidy %>%
+ filter(clinicalFlag == TRUE) %>%
+ prep_df_whichData(onAnVILDF = onAnVILDF) %>%
+ plot_which_data(subtitle = "Respondents moderately or extremely experienced with clinical data")
+```
+
+## Just responses from those moderately or extremely experienced with human genomic data
+
+```{r message=FALSE, echo=FALSE}
+resultsTidy %>%
+ filter(humanGenomicFlag == TRUE) %>%
+ prep_df_whichData(onAnVILDF = onAnVILDF) %>%
+ plot_which_data(subtitle = "Respondents moderately or extremely experienced with human genomic data")
+```
+
+## Just responses from those moderately or extremely experienced with non-human genomic data
+
+```{r message=FALSE, echo=FALSE}
+resultsTidy %>%
+ filter(nonHumanGenomicFlag == TRUE) %>%
+ prep_df_whichData(onAnVILDF = onAnVILDF) %>%
+ plot_which_data(subtitle = "Respondents moderately or extremely experienced with non-human genomic data")
+```
+
+## Takeaway
+
+Of the survey-provided choices, respondents were particularly interested in accessing [All of Us](https://www.researchallofus.org/), [UK Biobank](https://www.ukbiobank.ac.uk/enable-your-research/about-our-data), and [GTEx](https://anvilproject.org/data/consortia/GTEx) (though All of Us and UK Biobank are not currently AnVIL-hosted). Two respondents (moderately or extremely experienced with genomic data) specifically wrote in ["TCGA"](https://www.cancer.gov/ccg/research/genome-sequencing/tcga). The trend of All of Us, UK Biobank, and GTEx being chosen most often was consistent across all three research categories (moderately or extremely experienced with clinical, human genomic, or non-human genomic data).
+
+
+
diff --git a/pages/IdentifyTypeOfUsers.Rmd b/pages/IdentifyTypeOfUsers.Rmd
new file mode 100644
index 0000000..48f2032
--- /dev/null
+++ b/pages/IdentifyTypeOfUsers.Rmd
@@ -0,0 +1,48 @@
+---
+title: "Identify current vs potential users"
+author: ""
+date: ""
+output: html_document
+---
+
+```{r, message = FALSE, echo = FALSE,results='hide', warning=FALSE}
+library(tidyverse)
+library(here)
+
+# Inherit `resultsTidy`
+knitr::knit_child(here("TidyData.Rmd"))
+# Import functions to stylize plots
+source(here("resources/scripts/shared_functions.R"))
+```
+
+```{r, message=FALSE, echo=FALSE}
+topPlot <- resultsTidy %>%
+ group_by(UserType, CurrentUsageDescription) %>%
+ summarize(count = n()) %>%
+ mutate(CurrentUsageDescription = case_when(
+ CurrentUsageDescription == "For ongoing projects (e.g., consistent project development and/or work)" ~ "For ongoing projects:\nconsistent project development\nand/or work",
+ CurrentUsageDescription == "For completed/long-term projects (e.g., occasional updates/maintenance as needed)" ~ "For completed/long-term projects:\noccasional updates/maintenance\nas needed",
+ CurrentUsageDescription == "For short-term projects (e.g., short, intense bursts separated by a few months)" ~ "For short-term projects:\nshort, intense bursts\nseparated by a few months",
+ CurrentUsageDescription == "I do not currently use the AnVIL, but have in the past" ~ "I do not current use the AnVIL,\nbut have in the past",
+ CurrentUsageDescription == "I have never used the AnVIL, but have heard of it" ~ "I have never\nused the AnVIL",
+ CurrentUsageDescription == "I have never heard of the AnVIL" ~ "I have never\nheard of the AnVIL"
+ )) %>%
+ ggplot(aes(x = count,
+ y = reorder(CurrentUsageDescription, count),
+ fill = UserType)) +
+ geom_bar(stat="identity", position ="stack") +
+ ggtitle("How would you describe your current usage\nof the AnVIL platform?") +
+ geom_text(aes(label = count, group = CurrentUsageDescription),
+ hjust = -0.5, size=2)
+
+stylize_bar(topPlot)
+```
+
+## Takeaway
+
+Of the `r nrow(resultsTidy)` responses, `r nrow(resultsTidy %>% filter(UserType == "Current User"))` were from current users and `r nrow(resultsTidy %>% filter(UserType == "Potential User"))` were from potential users. Most current users use the AnVIL for ongoing projects, while potential users were roughly evenly split between those who have never used the AnVIL (but have heard of it) and those who previously used the AnVIL but do not currently.
+
+## Potential Follow-ups
+
+- Look at whether potential users who previously used the AnVIL show overall trends similar to the rest of the potential users
+- Directly ask why they no longer use the AnVIL
\ No newline at end of file
diff --git a/pages/Preferences.Rmd b/pages/Preferences.Rmd
new file mode 100644
index 0000000..72ce9a2
--- /dev/null
+++ b/pages/Preferences.Rmd
@@ -0,0 +1,192 @@
+---
+title: "Preferences"
+output: html_document
+---
+
+```{r, message = FALSE, echo = FALSE,results='hide', warning=FALSE}
+library(tidyverse)
+library(here)
+library(grid) #for Grobs
+library(scales) #pretty breaks
+
+# Inherit `resultsTidy`
+knitr::knit_child(here("TidyData.Rmd"))
+# Import functions to stylize plots
+source(here("resources/scripts/shared_functions.R"))
+```
+
+# Feature Importance Ranking
+
+```{r, message=FALSE, echo = FALSE}
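+# Average rank per feature for each user type: sum the PotentialRank*/CurrentRank*
+# columns across all responses (NAs ignored) and divide by the number of potential
+# or current users, respectively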
+totalRanksdf <-
+ bind_rows(
+ resultsTidy %>%
+ select(starts_with("PotentialRank")) %>%
+ colSums(na.rm = TRUE) %>%
+ as.data.frame() %>% `colnames<-`(c("totalRank")) %>%
+ mutate(nranks = sum(resultsTidy$UserType == "Potential User"),
+ avgRank = totalRank / nranks),
+ resultsTidy %>%
+ select(starts_with("CurrentRank")) %>%
+ colSums(na.rm = TRUE) %>%
+ as.data.frame() %>% `colnames<-`(c("totalRank")) %>%
+ mutate(nranks = sum(resultsTidy$UserType == "Current User"),
+ avgRank = totalRank /nranks)
+ ) %>%
+ mutate(UsertypeFeature = rownames(.)) %>%
+ separate(UsertypeFeature, c("Usertype", "Feature"), sep = "Rank", remove = TRUE) %>%
+ mutate(Feature =
+ case_when(Feature == "EasyBillingSetup" ~ "Easy billing setup",
+ Feature == "FlatRateBilling" ~ "Flat-rate billing rather than use-based",
+ Feature == "FreeVersion" ~ "Free version with limited compute or storage",
+ Feature == "SupportDocs" ~ "On demand support and documentation",
+ Feature == "ToolsData" ~ "Specific tools or datasets are available/supported",
+ Feature == "CommunityAdoption" ~ "Greater adoption of the AnVIL by the scientific community"),
+ Usertype = factor(case_when(Usertype == "Potential" ~ "Potential User",
+ Usertype == "Current" ~ "Current User"), levels = c("Potential User", "Current User"))
+ )
+```
+
+
+```{r message=FALSE, echo=FALSE}
+gdumbbell <- ggplot(totalRanksdf,
+ aes(x = avgRank,
+ y = reorder(Feature, -avgRank))) +
+ geom_line() +
+ geom_point(aes(color = Usertype), size = 3) +
+ ggtitle("Rank the following features\naccording to their importance to\nyou as a potential user or for\nyour continued use of the AnVIL")
+
+
+stylize_dumbbell(gdumbbell, xmax=6, importance = TRUE)
+```
+
+## Takeaways
+
+All respondents rate having specific tools or datasets supported/available as a very important feature for using the AnVIL. Compared to current users, potential users rate having a free version with limited compute or storage as the most important feature for their potential use of the AnVIL.
+
+## Potential Follow-ups
+
+ - Ask what specific tools people want available/supported
+
+
+
+# Training Workshop Modality Ranking
+
+```{r, message=FALSE, echo=FALSE}
+toPlotTrainingRanks <- bind_rows(
+ resultsTidy %>%
+ filter(UserType == "Current User") %>%
+ select(starts_with("AnVILTrainingWorkshops")) %>%
+ colSums(na.rm = TRUE) %>%
+ as.data.frame() %>% `colnames<-`(c("totalRank")) %>%
+ mutate(nranks = sum(resultsTidy$UserType == "Current User"),
+ avgRank = totalRank / nranks,
+ UserType = "Current User") %>%
+ mutate(TrainingType = rownames(.)) %>%
+ mutate(TrainingType = str_replace(TrainingType, "AnVILTrainingWorkshops", "")),
+ resultsTidy %>%
+ filter(UserType == "Potential User") %>%
+ select(starts_with("AnVILTrainingWorkshops")) %>%
+ colSums() %>%
+ as.data.frame() %>% `colnames<-`(c("totalRank")) %>%
+ mutate(nranks = sum(resultsTidy$UserType == "Potential User"),
+ avgRank = totalRank / nranks,
+ UserType = "Potential User") %>%
+ mutate(TrainingType = rownames(.)) %>%
+ mutate(TrainingType = str_replace(TrainingType, "AnVILTrainingWorkshops", ""))
+ ) %>% mutate(TrainingType = recode(TrainingType, "SpecEvent" = "AnVIL-specific event", "OnSite" = "On-site at my institution", "Conference" = "Conference (e.g., CSHL, AMIA)")) %>%
+ mutate(UserType = factor(UserType, levels = c("Potential User", "Current User")))
+
+```
+
+```{r, message=FALSE, echo=FALSE}
+tdumbbell <- ggplot(toPlotTrainingRanks,
+ aes(x = avgRank,
+ y = reorder(TrainingType, -avgRank))) +
+ geom_line() +
+ geom_point(aes(color = UserType), size = 3) +
+
+ ggtitle("Please rank how/where you would prefer to attend\nAnVIL training workshops.")
+
+stylize_dumbbell(tdumbbell, xmax=5, preference = TRUE)
+```
+
+## Takeaway
+
+Both current and potential users vastly prefer virtual training workshops.
+
+
+
+# Where analyses are currently run
+
+```{r message=FALSE, echo=FALSE}
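+# WhereAnalysesRun is a comma-separated multi-select; split it into one row per
+# platform, harmonize platform names, then count responses by user type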
+whereRunPlot <- resultsTidy %>%
+ separate(WhereAnalysesRun,
+ c("whereRunA", "whereRunB", "whereRunC", "whereRunD", "whereRunE", "whereRunF", "whereRunG"),
+ sep = ", ", fill = "right") %>%
+ pivot_longer(starts_with("whereRun"), values_to = "wherePlatforms") %>%
+ mutate(wherePlatforms =
+ recode(wherePlatforms,
+ "Amazon Web Services (AWS)" = "AWS",
+ "Galaxy (usegalaxy.org)" = "Galaxy",
+ "Galaxy Australia" = "Galaxy",
+ "Google Cloud Platform (GCP)" = "GCP",
+ "Institutional High Performance Computing cluster (HPC)" = "Institutional HPC",
+ "Personal computer (locally)," = "Personal computer (locally)",
+ "local server" = "Institutional HPC")
+ ) %>%
+ group_by(UserType, wherePlatforms) %>%
+ summarize(count = n()) %>%
+ drop_na() %>%
+ ggplot(aes(x = count,
+ y = reorder(wherePlatforms, count),
+ fill = UserType)) +
+ geom_bar(stat="identity") +
+ ggtitle("Where do you currently run analyses?")
+
+stylize_bar(whereRunPlot) +
+ ylab("Platform")
+```
+
+## Takeaways
+
+Institutional HPC and personal computers (local) are the most common responses. Google Cloud Platform (GCP) is reported as used more than other cloud providers within this sample. We also see that potential users report using Galaxy (a free option) more than current users do.
+
+
+
+# DMS compliance/data repositories
+
+TBA
+
+
+
+# Source for cloud computing funds
+
+```{r message=FALSE, echo=FALSE}
+plotFundingSource <- resultsTidy %>%
+ separate(FundingSources,
+ c("WhichA", "WhichB", "WhichC", "WhichD", "WhichE", "WhichF", "WhichG"),
+ sep = ", ",
+ fill="right") %>%
+ pivot_longer(starts_with("Which"),
+ names_to = "WhichChoice",
+ values_to = "whichFundingSource") %>%
+ drop_na(whichFundingSource) %>%
+ group_by(whichFundingSource, UserType) %>%
+ summarize(count = n()) %>%
+ ggplot(aes(y = reorder(whichFundingSource,count),
+ x = count,
+ fill = UserType)) +
+ geom_bar(position = "stack", stat = "identity") +
+ ggtitle("What source(s) of funds do you use to pay for cloud computing?")
+
+stylize_bar(plotFundingSource) +
+ ylab("Funding Source")
+
+```
+
+## Takeaway
+
+NIH funds (NHGRI or otherwise) as well as institutional funds are the most commonly reported funding sources.
+
+
diff --git a/pages/_site.yml b/pages/_site.yml
new file mode 100644
index 0000000..a4cdae6
--- /dev/null
+++ b/pages/_site.yml
@@ -0,0 +1,36 @@
+name: State of the AnVIL
+output_dir: '../docs'
+navbar:
+ title: "AnVIL Poll 2024"
+ left:
+ - text: ""
+ href: index.html
+ icon: fa-home
+ - text: "Identify User Type"
+ href: IdentifyTypeOfUsers.html
+ - text: "Demographics"
+ href: Demographics.html
+ - text: "Experience"
+ href: Experience.html
+ - text: "Awareness"
+ href: Awareness.html
+ - text: "Preferences"
+ href: Preferences.html
+ - text: "Current Users"
+ href: "CurrentUserQs.html"
+ - text: "Contact"
+ href: contact.html
+ icon: fa-envelope
+ - text: AnVIL Home
+ href: https://www.anvilproject.org/
+
+
+output:
+ html_document:
+ theme: cosmo
+ lib_dir: site_libs
+ self_contained: no
+ highlight: textmate
+ css: styles.css
+ includes:
+ in_header: resources/header.html
diff --git a/pages/contact.Rmd b/pages/contact.Rmd
new file mode 100644
index 0000000..ab46f44
--- /dev/null
+++ b/pages/contact.Rmd
@@ -0,0 +1,13 @@
+---
+title: "**Contact Us**"
+output:
+ html_document
+---
+
+
+
+If you have questions, please contact:
+
+* Ava Hoffman (ahoffma2@fredhutch.org)
+* Kate Isaac (kisaac@fredhutch.org)
+* https://help.anvilproject.org/
\ No newline at end of file
diff --git a/pages/index.Rmd b/pages/index.Rmd
new file mode 100644
index 0000000..2b1d852
--- /dev/null
+++ b/pages/index.Rmd
@@ -0,0 +1,32 @@
+---
+title: "**About the AnVIL Community Poll**"
+output:
+ html_document
+---
+
+
+
+### **Results Summary**
+
+- The majority of current users use the AnVIL for ongoing projects. Potential users were evenly split between those who have never used the AnVIL (but have heard of it) and those who have used the AnVIL at some point previously.
+- Most of the respondents have a PhD or are currently working on a PhD.
+- Many respondents do computational work.
+- Almost all respondents are affiliated with a research-intensive institution.
+- PRIMED, GREGoR, and eMERGE were the most popular consortia affiliations. CCDG and GTEx were also represented.
+- *For current users*, the most common computational need was large amounts of storage.
+- Many users reported they were "extremely likely" to recommend the AnVIL, but we'd like to follow up with users who chose differently.
+
+Click on each tab to learn more.
+
+
+
+### **Design**
+
+This user poll was conducted in Spring 2024, with the poll open for responses from February 15th to March 25th. A total of 52 responses were received, two of which were determined to be duplicate responses from the same users, leaving **50 user responses** for the analysis. The following graphic shows the arrangement of questions. The first question was used to separate respondents into "Current" and "Potential" users. Demographics, experience, awareness, and preference-related questions were asked of all users.
+
+```{r, fig.align='center', fig.alt= "Example image", echo = FALSE, out.width="100%"}
+ottrpal::include_slide("https://docs.google.com/presentation/d/1PANHDY3T9wpEX1GBYcqkdYMG0c20UeZPH2pwjHFcLik/edit#slide=id.g2dda935efac_0_0")
+```
+
+
+
diff --git a/resources/header.html b/pages/resources/header.html
similarity index 100%
rename from resources/header.html
rename to pages/resources/header.html
diff --git a/resources/images/actions.png b/pages/resources/images/actions.png
similarity index 100%
rename from resources/images/actions.png
rename to pages/resources/images/actions.png
diff --git a/resources/images/add_rule.png b/pages/resources/images/add_rule.png
similarity index 100%
rename from resources/images/add_rule.png
rename to pages/resources/images/add_rule.png
diff --git a/resources/images/blue.png b/pages/resources/images/blue.png
similarity index 100%
rename from resources/images/blue.png
rename to pages/resources/images/blue.png
diff --git a/resources/images/branch_rule.png b/pages/resources/images/branch_rule.png
similarity index 100%
rename from resources/images/branch_rule.png
rename to pages/resources/images/branch_rule.png
diff --git a/resources/images/branches.png b/pages/resources/images/branches.png
similarity index 100%
rename from resources/images/branches.png
rename to pages/resources/images/branches.png
diff --git a/resources/images/checks.png b/pages/resources/images/checks.png
similarity index 100%
rename from resources/images/checks.png
rename to pages/resources/images/checks.png
diff --git a/resources/images/checks2.png b/pages/resources/images/checks2.png
similarity index 100%
rename from resources/images/checks2.png
rename to pages/resources/images/checks2.png
diff --git a/resources/images/classic_tokens.png b/pages/resources/images/classic_tokens.png
similarity index 100%
rename from resources/images/classic_tokens.png
rename to pages/resources/images/classic_tokens.png
diff --git a/resources/images/create-repo.png b/pages/resources/images/create-repo.png
similarity index 100%
rename from resources/images/create-repo.png
rename to pages/resources/images/create-repo.png
diff --git a/resources/images/developer_settings.png b/pages/resources/images/developer_settings.png
similarity index 100%
rename from resources/images/developer_settings.png
rename to pages/resources/images/developer_settings.png
diff --git a/resources/images/emoji.png b/pages/resources/images/emoji.png
similarity index 100%
rename from resources/images/emoji.png
rename to pages/resources/images/emoji.png
diff --git a/resources/images/example_image.png b/pages/resources/images/example_image.png
similarity index 100%
rename from resources/images/example_image.png
rename to pages/resources/images/example_image.png
diff --git a/pages/resources/images/favicon.ico b/pages/resources/images/favicon.ico
new file mode 100755
index 0000000..f1eb1c4
Binary files /dev/null and b/pages/resources/images/favicon.ico differ
diff --git a/resources/images/general.png b/pages/resources/images/general.png
similarity index 100%
rename from resources/images/general.png
rename to pages/resources/images/general.png
diff --git a/resources/images/generate_classic.png b/pages/resources/images/generate_classic.png
similarity index 100%
rename from resources/images/generate_classic.png
rename to pages/resources/images/generate_classic.png
diff --git a/resources/images/gh_pat_set_up.png b/pages/resources/images/gh_pat_set_up.png
similarity index 100%
rename from resources/images/gh_pat_set_up.png
rename to pages/resources/images/gh_pat_set_up.png
diff --git a/resources/images/icons.png b/pages/resources/images/icons.png
similarity index 100%
rename from resources/images/icons.png
rename to pages/resources/images/icons.png
diff --git a/resources/images/main_branch.png b/pages/resources/images/main_branch.png
similarity index 100%
rename from resources/images/main_branch.png
rename to pages/resources/images/main_branch.png
diff --git a/resources/images/more_icons.png b/pages/resources/images/more_icons.png
similarity index 100%
rename from resources/images/more_icons.png
rename to pages/resources/images/more_icons.png
diff --git a/resources/images/more_icons_result.png b/pages/resources/images/more_icons_result.png
similarity index 100%
rename from resources/images/more_icons_result.png
rename to pages/resources/images/more_icons_result.png
diff --git a/resources/images/navbar.png b/pages/resources/images/navbar.png
similarity index 100%
rename from resources/images/navbar.png
rename to pages/resources/images/navbar.png
diff --git a/resources/images/page_modification.png b/pages/resources/images/page_modification.png
similarity index 100%
rename from resources/images/page_modification.png
rename to pages/resources/images/page_modification.png
diff --git a/resources/images/pages.png b/pages/resources/images/pages.png
similarity index 100%
rename from resources/images/pages.png
rename to pages/resources/images/pages.png
diff --git a/resources/images/render_error.png b/pages/resources/images/render_error.png
similarity index 100%
rename from resources/images/render_error.png
rename to pages/resources/images/render_error.png
diff --git a/resources/images/render_fail.png b/pages/resources/images/render_fail.png
similarity index 100%
rename from resources/images/render_fail.png
rename to pages/resources/images/render_fail.png
diff --git a/resources/images/render_fail2.png b/pages/resources/images/render_fail2.png
similarity index 100%
rename from resources/images/render_fail2.png
rename to pages/resources/images/render_fail2.png
diff --git a/resources/images/render_fail_details.png b/pages/resources/images/render_fail_details.png
similarity index 100%
rename from resources/images/render_fail_details.png
rename to pages/resources/images/render_fail_details.png
diff --git a/resources/images/repo_action_token.png b/pages/resources/images/repo_action_token.png
similarity index 100%
rename from resources/images/repo_action_token.png
rename to pages/resources/images/repo_action_token.png
diff --git a/resources/images/set_up_token.png b/pages/resources/images/set_up_token.png
similarity index 100%
rename from resources/images/set_up_token.png
rename to pages/resources/images/set_up_token.png
diff --git a/resources/images/setting_gh_pat.png b/pages/resources/images/setting_gh_pat.png
similarity index 100%
rename from resources/images/setting_gh_pat.png
rename to pages/resources/images/setting_gh_pat.png
diff --git a/resources/images/settings.png b/pages/resources/images/settings.png
similarity index 100%
rename from resources/images/settings.png
rename to pages/resources/images/settings.png
diff --git a/resources/images/spelling.png b/pages/resources/images/spelling.png
similarity index 100%
rename from resources/images/spelling.png
rename to pages/resources/images/spelling.png
diff --git a/resources/images/theme.png b/pages/resources/images/theme.png
similarity index 100%
rename from resources/images/theme.png
rename to pages/resources/images/theme.png
diff --git a/resources/images/title.png b/pages/resources/images/title.png
similarity index 100%
rename from resources/images/title.png
rename to pages/resources/images/title.png
diff --git a/resources/images/url_check.png b/pages/resources/images/url_check.png
similarity index 100%
rename from resources/images/url_check.png
rename to pages/resources/images/url_check.png
diff --git a/resources/images/urls.png b/pages/resources/images/urls.png
similarity index 100%
rename from resources/images/urls.png
rename to pages/resources/images/urls.png
diff --git a/resources/images/usersettings.png b/pages/resources/images/usersettings.png
similarity index 100%
rename from resources/images/usersettings.png
rename to pages/resources/images/usersettings.png
diff --git a/resources/images/website_setup.png b/pages/resources/images/website_setup.png
similarity index 100%
rename from resources/images/website_setup.png
rename to pages/resources/images/website_setup.png
diff --git a/resources/images/workflows.png b/pages/resources/images/workflows.png
similarity index 100%
rename from resources/images/workflows.png
rename to pages/resources/images/workflows.png
diff --git a/plots/AnVILPoll2024Flowchart.png b/plots/AnVILPoll2024Flowchart.png
new file mode 100644
index 0000000..7d40bce
Binary files /dev/null and b/plots/AnVILPoll2024Flowchart.png differ
diff --git a/plots/dataresources_comfortscore.png b/plots/dataresources_comfortscore.png
new file mode 100644
index 0000000..98de027
Binary files /dev/null and b/plots/dataresources_comfortscore.png differ
diff --git a/plots/degree_furthersimplified_usertype.png b/plots/degree_furthersimplified_usertype.png
new file mode 100644
index 0000000..29490e0
Binary files /dev/null and b/plots/degree_furthersimplified_usertype.png differ
diff --git a/plots/degree_usertype.png b/plots/degree_usertype.png
new file mode 100644
index 0000000..4692f6d
Binary files /dev/null and b/plots/degree_usertype.png differ
diff --git a/plots/densityplot_rankfeatures.png b/plots/densityplot_rankfeatures.png
new file mode 100644
index 0000000..1831c4c
Binary files /dev/null and b/plots/densityplot_rankfeatures.png differ
diff --git a/plots/densityplot_rankfeatures_faceted.png b/plots/densityplot_rankfeatures_faceted.png
new file mode 100644
index 0000000..ed98e6c
Binary files /dev/null and b/plots/densityplot_rankfeatures_faceted.png differ
diff --git a/plots/dumbbellplot_rankfeatures.png b/plots/dumbbellplot_rankfeatures.png
new file mode 100644
index 0000000..604fe80
Binary files /dev/null and b/plots/dumbbellplot_rankfeatures.png differ
diff --git a/plots/dumbbellplot_trainingmodalitypref.png b/plots/dumbbellplot_trainingmodalitypref.png
new file mode 100644
index 0000000..422aa3e
Binary files /dev/null and b/plots/dumbbellplot_trainingmodalitypref.png differ
diff --git a/plots/dumbbellplot_xlim15_revaxis_trainingmodalitypref.png b/plots/dumbbellplot_xlim15_revaxis_trainingmodalitypref.png
new file mode 100644
index 0000000..41a6ee1
Binary files /dev/null and b/plots/dumbbellplot_xlim15_revaxis_trainingmodalitypref.png differ
diff --git a/plots/dumbbellplot_xlim15_trainingmodalitypref.png b/plots/dumbbellplot_xlim15_trainingmodalitypref.png
new file mode 100644
index 0000000..98dafd5
Binary files /dev/null and b/plots/dumbbellplot_xlim15_trainingmodalitypref.png differ
diff --git a/plots/dumbbellplot_xlim16_rankfeatures.png b/plots/dumbbellplot_xlim16_rankfeatures.png
new file mode 100644
index 0000000..65e3c56
Binary files /dev/null and b/plots/dumbbellplot_xlim16_rankfeatures.png differ
diff --git a/plots/dumbbellplot_xlim16_revaxis_rankfeatures.png b/plots/dumbbellplot_xlim16_revaxis_rankfeatures.png
new file mode 100644
index 0000000..16f9f7a
Binary files /dev/null and b/plots/dumbbellplot_xlim16_revaxis_rankfeatures.png differ
diff --git a/plots/fundingsources.png b/plots/fundingsources.png
new file mode 100644
index 0000000..b8b4370
Binary files /dev/null and b/plots/fundingsources.png differ
diff --git a/plots/fundingsources_colorSource.png b/plots/fundingsources_colorSource.png
new file mode 100644
index 0000000..aeb108d
Binary files /dev/null and b/plots/fundingsources_colorSource.png differ
diff --git a/plots/institutionalAffilition_allResponses.png b/plots/institutionalAffilition_allResponses.png
new file mode 100644
index 0000000..b346465
Binary files /dev/null and b/plots/institutionalAffilition_allResponses.png differ
diff --git a/plots/institutionalAffilition_currentUserResponses.png b/plots/institutionalAffilition_currentUserResponses.png
new file mode 100644
index 0000000..1667b25
Binary files /dev/null and b/plots/institutionalAffilition_currentUserResponses.png differ
diff --git a/plots/institutionalAffilition_potentialUserResponses.png b/plots/institutionalAffilition_potentialUserResponses.png
new file mode 100644
index 0000000..51276f2
Binary files /dev/null and b/plots/institutionalAffilition_potentialUserResponses.png differ
diff --git a/plots/institutionalType_allResponses_colorUserType.png b/plots/institutionalType_allResponses_colorUserType.png
new file mode 100644
index 0000000..1875518
Binary files /dev/null and b/plots/institutionalType_allResponses_colorUserType.png differ
diff --git a/plots/institutionalType_faceteduserType.png b/plots/institutionalType_faceteduserType.png
new file mode 100644
index 0000000..deed2fa
Binary files /dev/null and b/plots/institutionalType_faceteduserType.png differ
diff --git a/plots/institutionalType_simplified_allResponses_colorUserType.png b/plots/institutionalType_simplified_allResponses_colorUserType.png
new file mode 100644
index 0000000..bf27bbd
Binary files /dev/null and b/plots/institutionalType_simplified_allResponses_colorUserType.png differ
diff --git a/plots/researchExperienceLevel_colorExperience.png b/plots/researchExperienceLevel_colorExperience.png
new file mode 100644
index 0000000..4fd7861
Binary files /dev/null and b/plots/researchExperienceLevel_colorExperience.png differ
diff --git a/plots/researchExperienceLevel_colorExperienceLevel_noUserTypeSplit.png b/plots/researchExperienceLevel_colorExperienceLevel_noUserTypeSplit.png
new file mode 100644
index 0000000..d376dc7
Binary files /dev/null and b/plots/researchExperienceLevel_colorExperienceLevel_noUserTypeSplit.png differ
diff --git a/plots/researchExperienceLevel_colorResearchType.png b/plots/researchExperienceLevel_colorResearchType.png
new file mode 100644
index 0000000..6f0d3f6
Binary files /dev/null and b/plots/researchExperienceLevel_colorResearchType.png differ
diff --git a/plots/researchExperienceLevel_noColor_noUserTypeSplit.png b/plots/researchExperienceLevel_noColor_noUserTypeSplit.png
new file mode 100644
index 0000000..0684e4e
Binary files /dev/null and b/plots/researchExperienceLevel_noColor_noUserTypeSplit.png differ
diff --git a/plots/researchExperienceLevel_sequentialColor_noUserTypeSplit.png b/plots/researchExperienceLevel_sequentialColor_noUserTypeSplit.png
new file mode 100644
index 0000000..7e8095b
Binary files /dev/null and b/plots/researchExperienceLevel_sequentialColor_noUserTypeSplit.png differ
diff --git a/plots/respondent_usagedescription.png b/plots/respondent_usagedescription.png
new file mode 100644
index 0000000..a52bb10
Binary files /dev/null and b/plots/respondent_usagedescription.png differ
diff --git a/plots/stackedbarplot_rankfeatures.png b/plots/stackedbarplot_rankfeatures.png
new file mode 100644
index 0000000..803e678
Binary files /dev/null and b/plots/stackedbarplot_rankfeatures.png differ
diff --git a/plots/tooldataresourcecomfortscore_singlepanel.png b/plots/tooldataresourcecomfortscore_singlepanel.png
new file mode 100644
index 0000000..fa8e3ec
Binary files /dev/null and b/plots/tooldataresourcecomfortscore_singlepanel.png differ
diff --git a/plots/tooldataresourcecomfortscore_singlepanel_by_potential_users.png b/plots/tooldataresourcecomfortscore_singlepanel_by_potential_users.png
new file mode 100644
index 0000000..d6e759f
Binary files /dev/null and b/plots/tooldataresourcecomfortscore_singlepanel_by_potential_users.png differ
diff --git a/plots/toolsSeparateFromAnVIL_comfortscore.png b/plots/toolsSeparateFromAnVIL_comfortscore.png
new file mode 100644
index 0000000..e61c841
Binary files /dev/null and b/plots/toolsSeparateFromAnVIL_comfortscore.png differ
diff --git a/plots/tools_comfortscore.png b/plots/tools_comfortscore.png
new file mode 100644
index 0000000..25a3ca6
Binary files /dev/null and b/plots/tools_comfortscore.png differ
diff --git a/plots/typesOfData.png b/plots/typesOfData.png
new file mode 100644
index 0000000..38bd67e
Binary files /dev/null and b/plots/typesOfData.png differ
diff --git a/plots/typesOfData_clinical.png b/plots/typesOfData_clinical.png
new file mode 100644
index 0000000..441913b
Binary files /dev/null and b/plots/typesOfData_clinical.png differ
diff --git a/plots/typesOfData_humangenomic.png b/plots/typesOfData_humangenomic.png
new file mode 100644
index 0000000..cdd3df8
Binary files /dev/null and b/plots/typesOfData_humangenomic.png differ
diff --git a/plots/whichcontrolleddata.png b/plots/whichcontrolleddata.png
new file mode 100644
index 0000000..1ccfe73
Binary files /dev/null and b/plots/whichcontrolleddata.png differ
diff --git a/plots/whichcontrolleddata_clinical.png b/plots/whichcontrolleddata_clinical.png
new file mode 100644
index 0000000..d4dd60b
Binary files /dev/null and b/plots/whichcontrolleddata_clinical.png differ
diff --git a/plots/whichcontrolleddata_humangenomic.png b/plots/whichcontrolleddata_humangenomic.png
new file mode 100644
index 0000000..3455cf2
Binary files /dev/null and b/plots/whichcontrolleddata_humangenomic.png differ
diff --git a/plots/whichcontrolleddata_nonhumangenomic.png b/plots/whichcontrolleddata_nonhumangenomic.png
new file mode 100644
index 0000000..9760307
Binary files /dev/null and b/plots/whichcontrolleddata_nonhumangenomic.png differ
diff --git a/resources/.DS_Store b/resources/.DS_Store
index 6956cf3..e955f78 100644
Binary files a/resources/.DS_Store and b/resources/.DS_Store differ
diff --git a/resources/dictionary.txt b/resources/dictionary.txt
index c3dd255..f7e3889 100644
--- a/resources/dictionary.txt
+++ b/resources/dictionary.txt
@@ -1,23 +1,100 @@
+AllTech
+AMIA
+AnVIL
+AnVILTrainingWorkshops
+automations
+avgRank
+AWS
+Biobank
+Bootstap
+capitalizations
+CCDG
cheatsheet
+ClinVar
+CMG
+codebook
+CSER
+CSHL
css
+CurrentAnVILTech
+CurrentRank
+CurrentRankCommunityAdoption
+CurrentRankEasyBillingSetup
+CurrentRankFlatRateBilling
+CurrentRankFreeVersion
+CurrentRankSupportDocs
+CurrentRankToolsData
+CurrentUser
custimization
cwrigh
+dbGap
Dockerfile
Dockerhub
+eMERGE
+Epigenomes
+exomes
+FALSEs
favicon
+Gabriella
+GCP
+ggplot
GH
Github
GitHub
+glyphicons
+GMKF
+GnomAD
+GREGoR
+GTEx
+HPRC
HTTPS
+Humphries
+IDEs
+InstitutionalAffiliation
+InstitutionalType
+Jupyter
lightblue
lightgreen
+MDs
+Metabolomes
+Metagenomes
+na
+NA's
+NCPI
+NHGRI
+omics
OTTR
+Pangenome
+PotentialRank
+PotentialRankCommunityAdoption
+PotentialRankEasyBillingSetup
+PotentialRankFlatRateBilling
+PotentialRankFreeVersion
+PotentialRankSupportDocs
+PotentialRankToolsData
+PotentialUser
+Proteomes
+recode
+Recode
+recoded
+recoding
+repo
+respondant
Rmd
+RStudio
subdir
+TCGA
+TDR
th
+tidyverse
+Transcriptomes
+TRUEs
+UDN
+Undiagnosed
+Usertype
+UserType
+WDL
+WhichA
+WhichN
+Workspaces
www
-automations
-Bootstap
-glyphicons
-repo
-RStudio
diff --git a/resources/images/.DS_Store b/resources/images/.DS_Store
deleted file mode 100644
index 5008ddf..0000000
Binary files a/resources/images/.DS_Store and /dev/null differ
diff --git a/resources/images/favicon.ico b/resources/images/favicon.ico
deleted file mode 100755
index 36c1256..0000000
Binary files a/resources/images/favicon.ico and /dev/null differ
diff --git a/resources/render.R b/resources/render.R
new file mode 100644
index 0000000..251e501
--- /dev/null
+++ b/resources/render.R
@@ -0,0 +1,24 @@
+# Render any materials relying on googlesheets4 package
+
+# Make sure to set appropriate secrets to allow auto-rendering to happen.
+# This can be done via settings > secrets > actions menus.
+# Key should be generated in json format via the https://console.cloud.google.com/
+# interface, by creating a key that can be used by a Google Service Account.
+
+# Note that credentials need to be created in the pull_request.yml or
+# render-all.yml workflows first. Any Google Sheets in question must also be
+# shared with the Google Service Account.
+
+# --------- Authenticate ---------
+
+library(googlesheets4)
+
+gs4_deauth()
+gs4_auth(
+ token = gargle::credentials_service_account(path = paste0(
+ ".secrets/", grep(".json$", list.files(".secrets"), value = TRUE)
+ ),
+ scopes = "https://www.googleapis.com/auth/spreadsheets")
+)
+
+rmarkdown::render_site(input = 'pages')
diff --git a/resources/scripts/shared_functions.R b/resources/scripts/shared_functions.R
new file mode 100644
index 0000000..ee3955d
--- /dev/null
+++ b/resources/scripts/shared_functions.R
@@ -0,0 +1,115 @@
+#!/usr/bin/env Rscript
+
+library(magrittr)
+library(tidyverse)
+library(grid) # for textGrob() and gpar() in stylize_dumbbell()
+
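+# stylize_bar(): apply shared styling to a bar chart; classic theme, blank y-axis
+# label, "Count" x-axis label, no legend title, and a fill palette chosen by the
+# usertypeColor (default, 2 colors), singleColor (1 color), or sequentialColor
+# (5 colors) flag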
+stylize_bar <- function(gplot, usertypeColor = TRUE, singleColor = FALSE, sequentialColor = FALSE){
+ if (usertypeColor) {
+ fillColors <- c("#E0DD10", "#035C94")
+ }
+ else if (singleColor){
+ fillColors <- c("#25445A")
+ }
+ else if (sequentialColor){
+ fillColors <- c("#035C94","#035385","#024A77","#024168", "#02395B")
+ }
+ return(
+ gplot +
+ theme_classic() +
+ ylab("") +
+ xlab("Count") +
+ theme(legend.title = element_blank()) +
+ scale_fill_manual(values = fillColors, na.translate = F)
+ )
+}
+
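+# stylize_dumbbell(): apply shared styling to a dumbbell (average rank) plot;
+# reversed x-axis running from xmax down to 1, shared color scale, and
+# "Most"/"Least" annotations below the axis; set importance = TRUE or
+# preference = TRUE to choose the annotation wording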
+stylize_dumbbell <- function(gplot, xmax = NULL, importance = FALSE, preference = FALSE){
+ if (importance){
+ textGrobMost <- "Most\nimportant"
+ textGrobLeast <- "Least\nimportant"
+ }
+ else if (preference){
+ textGrobMost <- "Most\npreferred"
+ textGrobLeast <- "Least\npreferred"
+ }
+ return(
+ gplot +
+ theme_bw() +
+ theme(panel.background = element_blank(),
+ legend.position = "bottom",
+ legend.title = element_blank()) +
+ xlab("Average Rank Choice") +
+ ylab("") +
+ scale_color_manual(values = c("#E0DD10", "#035C94")) +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1,1.1, "cm")) +
+ scale_x_reverse(limits = c(xmax,1), breaks = xmax:1, labels = xmax:1) +
+ annotation_custom(textGrob(textGrobMost, gp=gpar(fontsize=8, fontface = "bold")),xmin=-1,xmax=-1,ymin=-0.5,ymax=-0.5) +
+ annotation_custom(textGrob(textGrobLeast, gp=gpar(fontsize=8, fontface= "bold")),xmin=-xmax,xmax=-xmax,ymin=-0.5,ymax=-0.5)
+ )
+}
+
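+# prep_df_whichData(): split the comma-separated AccessWhichControlledData column
+# into one row per selected dataset, count how often each dataset was chosen,
+# shorten long dataset names to abbreviations, and join the codebook table
+# (onAnVILDF) indicating whether each dataset is available on the AnVIL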
+prep_df_whichData <- function(subset_df, onAnVILDF = NULL){
+ subset_df %<>% separate(AccessWhichControlledData,
+ c("WhichA", "WhichB", "WhichC", "WhichD", "WhichE", "WhichF", "WhichG", "WhichH", "WhichI", "WhichJ", "WhichK", "WhichM", "WhichN"),
+ sep = ", ", fill="right") %>%
+ pivot_longer(starts_with("Which"),
+ names_to = "WhichChoice",
+ values_to = "whichControlledAccess") %>%
+ drop_na(whichControlledAccess) %>%
+ group_by(whichControlledAccess) %>%
+ summarize(count = n()) %>%
+ mutate(whichControlledAccess =
+ recode(whichControlledAccess,
+ "All of Us*" = "All of Us",
+ "UK Biobank*" = "UK Biobank",
+ "Centers for Common Disease Genomics (CCDG)" = "CCDG",
+ "The Centers for Mendelian Genomics (CMG)" = "CMG",
+ "Clinical Sequencing Evidence-Generating Research (CSER)" = "CSER",
+ "Electronic Medical Records and Genomics (eMERGE)" = "eMERGE",
+ "Gabriella Miller Kids First (GMKF)" = "GMKF",
+ "Genomics Research to Elucidate the Genetics of Rare Diseases (GREGoR)" = "GREGoR",
+ "The Genotype-Tissue Expression Project (GTEx)" = "GTEx",
+ "The Human Pangenome Reference Consortium (HPRC)" = "HPRC",
+ "Population Architecture Using Genomics and Epidemiology (PAGE)" = "PAGE",
+ "Undiagnosed Disease Network (UDN)" = "UDN",
+ "Being able to pull other dbGap data as needed." = "Other",
+ "Cancer omics datasets" = "Other",
+ "GnomAD and ClinVar" = "None", #not controlled access
+ )
+ ) %>%
+ left_join(onAnVILDF, by="whichControlledAccess")
+
+ return(subset_df)
+}
+
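+# plot_which_data(): bar chart of the counts from prep_df_whichData(), ordered by
+# decreasing count, filled by AnVIL availability, with count labels above the bars
+# and an optional subtitle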
+plot_which_data <- function(inputToPlotDF, subtitle = NULL){
+
+ toreturnplot <- ggplot(inputToPlotDF,
+ aes(
+ x = reorder(whichControlledAccess, -count),
+ y = count,
+ fill = AnVIL_Availability)
+ ) +
+ geom_bar(stat="identity") +
+ theme_classic() +
+ theme(panel.background = element_blank(),
+ panel.grid = element_blank(),
+ axis.text.x = element_text(angle=45, hjust=1),
+ legend.position = "inside",
+ legend.position.inside = c(0.8, 0.8)
+ ) +
+ xlab("Controlled access datasets") +
+ ylab("Count") +
+ ggtitle("What large, controlled access datasets do you access\nor would you be interested in accessing using the AnVIL?",
+ subtitle = subtitle) +
+ geom_text(aes(label = after_stat(y), group = whichControlledAccess),
+ stat = 'summary',
+ fun = sum,
+ vjust = -1,
+ size=2) +
+ coord_cartesian(clip = "off") +
+ scale_fill_manual(values = c("#25445A", "#7EBAC0", "grey"))
+
+ return(toreturnplot)
+}
diff --git a/setup.Rmd b/setup.Rmd
deleted file mode 100644
index f067187..0000000
--- a/setup.Rmd
+++ /dev/null
@@ -1,133 +0,0 @@
----
-title: "Setup"
-output: html_document
----
-
-
-### Getting started
-
-Create your repository by clicking on the `Use this Template` button at [OTTR_Template_Website repository](https://github.com/jhudsl/OTTR_Template_Website)
-
-![](resources/images/create-repo.png)
-
-You'll need to make your repository `public`.
-
-### Set your GH_PAT
-
-To enable the GitHub actions, your repository needs to be setup in a specific way.
-
-For OTTR GitHub actions to run, they need to have credentials through a personal access token.
-
-1. Set up your own personal access token [following these instructions](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) - but create a **classic token**. Keep this personal access token handy for the next step. When you get to the permissions page, check the box that says `repo` and select all that is underneath that header. No other permissions are necessary.
-
-Click here for more detailed instructions. The instructions for this step may change with updates to GitHub.
-
-First, go to your username settings, by clicking on your user icon (upper right corner) and scrolling down to settings.
-
-```{r, fig.align='center', fig.alt= "User settings", echo = FALSE, out.width="20%"}
-knitr::include_graphics("resources/images/usersettings.png")
-```
-
-
-Next, scroll all the way down on the far right menu to "Developer Settings".
-
-```{r, fig.align='center', fig.alt= "User settings", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/developer_settings.png")
-```
-
-
-Then select "Personal Access Tokens" and "Tokens (classic)"
-
-```{r, fig.align='center', fig.alt= "classic tokens", echo = FALSE, out.width="40%"}
-knitr::include_graphics("resources/images/classic_tokens.png")
-```
-
-Then click "Generate new token" and confirm that you want classic.
-
-```{r, fig.align='center', fig.alt= "generate classic token", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/generate_classic.png")
-```
-
-Finally, add a name select all the repo scopes and scroll down to the green button to generate the token. Copy this somewhere safe to then paste into your repository settings.
-
-```{r, fig.align='center', fig.alt= "set up token", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/set_up_token.png")
-```
-
-
-
-2. In your new OTTR_Template_Website derived repository, go to Settings > Secrets and variables > Actions. Click `New Repository Secret`.
-
-```{r, fig.align='center', fig.alt= "classic tokens", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/repo_action_token.png")
-```
-
-
-```{r, fig.align='center', fig.alt= "new repository secret", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/gh_pat_set_up.png")
-```
-
-
-In the window opened, name this new secret `GH_PAT` and paste the personal access token in the box below. (Note that the name `GH_PAT` is specific to how OTTR works and other secret names cannot be used and for OTTR to still work).
-
-Click the green button to add the secret.
-
-```{r, fig.align='center', fig.alt= "Clicking on settings", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/setting_gh_pat.png")
-```
-
-### Allow GitHub Actions
-
-Go to the settings menu for your repository that you created from the template. This should be located at the top of GitHub on the right side.
-
-Scroll down to the "Actions" button and click it, then click "General".
-```{r, fig.align='center', fig.alt= "allowing workflows", echo = FALSE, out.width="40%"}
-knitr::include_graphics("resources/images/actions.png")
-```
-
-```{r, fig.align='center', fig.alt= "allowing workflows", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/general.png")
-```
-
-
-
-Scroll down to the workflow permissions section and select "Read and write permissions", then click "Allow GitHub actions to create and approve pull requests.
-
-Finally, click "save".
-
-```{r, fig.align='center', fig.alt= "allowing workflows", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/workflows.png")
-```
-
-### Protect branches
-
-
-Although this isn't entirely required, its strongly recommended that you use these settings to protect your `main` branches.
-
-Click on settings in the far upper right corner:
-
-```{r, fig.align='center', fig.alt= "Clicking on settings", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/settings.png")
-```
-Click on branches:
-
-```{r, fig.align='center', fig.alt= "Clicking on settings", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/branches.png")
-```
-Click the add rule button.
-
-```{r, fig.align='center', fig.alt= "Adding rule", echo = FALSE}
-knitr::include_graphics("resources/images/add_rule.png")
-```
-Type "main" as the branch name pattern:
-
-```{r, fig.align='center', fig.alt= "adding main branch rule", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/main_branch.png")
-```
-Click on the following boxes to require pull requests before merging:
-
-```{r, fig.align='center', fig.alt= "modify branch rules", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/branch_rule.png")
-```
-
-Note that if you have admin privileges on this repository, you will likely still be able to override these branch protections so use caution when git pushing!
diff --git a/sota24analysis.Rmd b/sota24analysis.Rmd
new file mode 100644
index 0000000..3af4d02
--- /dev/null
+++ b/sota24analysis.Rmd
@@ -0,0 +1,1872 @@
+---
+title: "State of the AnVIL 2024"
+author: "Kate Isaac, Elizabeth Humphries, & Ava Hoffman"
+date: "`r Sys.Date()`"
+output: html_document
+---
+
+```{r}
+library(googlesheets4)
+library(tidyverse)
+library(magrittr) #for %<>%
+library(here)
+library(grid) #for Grobs and unit()
+library(ggrepel) #for geom_text_repel()
+library(patchwork)
+```
+
+# Read in data
+
+The Google Sheet we are reading in is stored in an AnVIL Google Drive folder, `State of the AnVIL 2024`. Its permissions are restricted such that only people with access can open it with the link. When using `gs4_auth()` to authorize my Google account before running this code, I needed to change the `scopes` argument; specifically, `scopes = "spreadsheets.readonly"` was necessary.
+
+In this Google Sheet, each question is a column, and each response to the survey is a row. If the respondent wasn't asked or didn't answer a specific question, there is an NA in the corresponding row/column.
+
+```{r}
+gs4_auth(email = "kathryn.j.isaac@gmail.com", scopes="spreadsheets.readonly")
+resultsRaw <-
+ googlesheets4::read_sheet(
+ "https://docs.google.com/spreadsheets/d/1wDMNC6BD2AaIwh_GOkPTpl1tvAyLwVBQgAvOD2rYrX0/edit?usp=sharing",
+ na = c("NA", "na", ""))
+```
+
+# Clean data
+
+## Set Column Names
+
+Description of variable definitions and steps
+
+We have a codebook that is a tab-delimited file with 4 columns, where each row represents a question in the survey. The first column lists the question from the survey (`SurveyColNames`); the second column lists a corresponding simplified column name for that survey question (`SimplifiedColNames`); the third column describes the variable format (`VariableFormat`), e.g., is it a double or a character; the fourth column gives a lengthier description of the question (`Description`), e.g., who was asked it, what the possible answers are, etc.
+
+This code block reads in that codebook and specifically selects the `SimplifiedColNames` column. It then renames the columns of the raw results from the Google Sheet (where each question is a column) with these simplified column names.
+
+
+
+```{r}
+simplifiedColNames <-
+ read_delim(here("data/codebook.txt"),
+ delim = "\t",
+ col_select = SimplifiedColNames)
+resultsTidy <-
+ resultsRaw %>% `colnames<-`(unlist(simplifiedColNames))
+```
+
+## Keep last response if duplicated according to email (if email provided)
+
+We chose to keep the last response because the respondent may have spent more time thinking about how they wanted to respond after their initial response.
+
+Description of variable definitions and steps
+
+* The `table` function tabulates the number of occurrences, and we tell it to ignore literal NAs. Because providing an email was optional, we expect many NA responses. The `table` function, by ignoring NAs, will return the unique emails and the number of times each email was used. We store the tabulated results in the variable `tabulatedEmails`
+* Using the `sum` function, we look to see how many emails/responses are provided more than once. `tabulatedEmails > 1` is returning a vector of TRUEs and FALSEs where TRUE means that there was more than one instance/count of a given email and FALSE means there wasn't. The `sum` function in essence counts the number of TRUEs and if the `sum` is greater than 0, that means there is at least one duplicated email whose count is greater than 1.
+* `duplicatedEmails` reports which emails are duplicated by using the tabulated/table of emails. First it identifies which emails were observed more than once, using the `which` function, and uses the indices returned from that to index the `names` of the tabulated emails, grabbing the specific emails.
+* We want to know which entries from the overall survey responses to remove for each duplicated email. Ideally, we want to remove the responses all at the same time or go backwards removing one at a time, because we don't want to affect downstream indices. The approach here keeps track of all the indices of interest and removes them at the same time.
+ * Therefore, we'll use `lapply` to loop through the duplicated emails (`duplicatedEmails`) and grab the index for survey responses associated with that email address (`which(resultsTidy$Email == duplicatedEmails[x])`).
+ * However, we want to keep the last survey response for each duplicated email. Therefore, we wrap that `which` call in `head(_, -1)` so that it grabs all indices except the last one.
+ * Finally, we `unlist` the indices so that there's a single vector associated with indices for any duplicated email responses to be removed `IDXs_to_remove`. And since we want to remove them all at the same time, we subset `resultsTidy`, grabbing every row except those in `IDXs_to_remove`, as denoted by the `-`.
+
+
+
+```{r}
+
+tabulatedEmails <- table(resultsTidy$Email, useNA = "no")
+
+if (sum(tabulatedEmails > 1) > 0) {
+ duplicatedEmails <-
+ names(tabulatedEmails)[which(tabulatedEmails > 1)]
+ IDXs_to_remove <-
+ unlist(lapply(1:length(duplicatedEmails), function(x)
+ head(
+ which(resultsTidy$Email == duplicatedEmails[x]),-1
+ )))
+ resultsTidy <- resultsTidy[-IDXs_to_remove, ]
+}
+
+nrow(resultsTidy)
+```
+
+## Identify type of user
+
+Question and possible answers
+
+> How would you describe your current usage of the AnVIL platform?
+
+Possible answers include:
+
+* For completed/long-term projects (e.g., occasional updates/maintenance as needed)
+* For ongoing projects (e.g., consistent project development and/or work)
+* For short-term projects (e.g., short, intense bursts separated by a few months)
+* I do not currently use the AnVIL, but have in the past
+* I have never heard of the AnVIL
+* I have never used the AnVIL, but have heard of it.
+
+The first three possible answers represent current or returning AnVIL users. The last three possible answers represent potential AnVIL users.
+
+
+
+Description of variable definitions and steps
+
+We use `case_when` to evaluate the response in the `CurrentUsageDescription` column and assign a corresponding, simplified label of "CurrentUser" or "PotentialUser". In other words, we translate the given response to a user label. Using `case_when` as the nested function inside `mutate` means that the translation is saved in a new column, `UserType`.
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(
+ UserType = case_when(
+ CurrentUsageDescription == "For ongoing projects (e.g., consistent project development and/or work)" ~ "CurrentUser",
+ CurrentUsageDescription == "For completed/long-term projects (e.g., occasional updates/maintenance as needed)" ~ "CurrentUser",
+ CurrentUsageDescription == "For short-term projects (e.g., short, intense bursts separated by a few months)" ~ "CurrentUser",
+ CurrentUsageDescription == "I do not currently use the AnVIL, but have in the past" ~ "PotentialUser",
+ CurrentUsageDescription == "I have never used the AnVIL, but have heard of it" ~ "PotentialUser",
+ CurrentUsageDescription == "I have never heard of the AnVIL" ~ "PotentialUser"
+ )
+ ) %>%
+ mutate(UserType = factor(UserType, levels = c("PotentialUser", "CurrentUser")))
+```
+
+## Synchronize Institution Names
+
+Question and possible answers
+
+> What institution are you affiliated with?
+
+Free response for answers
+
+
+
+This synchronization corrects for the various spellings and capitalizations used for the same institution (e.g., Johns Hopkins and Johns Hopkins University refer to the same institution, despite the difference in the free responses).
+
+Description of variable definitions and steps
+
+We use a `recode()` within a `mutate()` to synchronize the institutional affiliations as necessary.
+
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(
+ InstitutionalAffiliation =
+ recode(
+ InstitutionalAffiliation,
+ "Broad" = "Broad Institute",
+ "broad institute" = "Broad Institute",
+ "CUNY School of Public Health; Roswell Park Comprehensive Cancer Center" = "City University of New York",
+ "harvard" = "Harvard University",
+ "Harvard Public Health" = "Harvard University",
+ "Johns hopkins" = "Johns Hopkins",
+ "Johns Hopkins University" = "Johns Hopkins",
+ "OHSU" = "Oregon Health & Science University",
+ "OHSU (Knight Center)" = "Oregon Health & Science University",
+ "The Ohio State University" = "Ohio State University",
+ "UCSC" = "University of California Santa Cruz",
+ "univ. ca. santa cruz" = "University of California Santa Cruz",
+ "university of California santa cruz" = "University of California Santa Cruz",
+ "UMASS Chan Medical School" = "UMass Chan Medical School",
+ "Umass Chan Medical School" = "UMass Chan Medical School",
+ "Washington University in St Louis" = "Washington University in St. Louis",
+ "yikongene" = "Yikon Genomics",
+ "v" = "Unknown"
+ )
+ )
+```
+
+## Highest degree attained
+
+Question and possible answers
+
+> What is the highest degree you have attained?
+
+Possible answers include (multiple choices could be selected and would be comma separated if so):
+
+* High school or equivalent
+* Bachelor's degree
+* Master's degree in progress
+* Master's degree
+* PhD in progress
+* PhD
+* MD in progress
+* MD
+* Other (with free text entry)
+
+
+
+Description of variable definitions and steps
+
+Because multiple responses could be selected (and would be comma separated) and because a free text response was possible if "Other" was selected, we need to tidy the data from this question. From visual inspection of the data, the only time multiple responses were selected was for MD/PhD, and no "Other" responses were selected. So we'll just recode "PhD, MD" to be "MD/PhD". A quick programmatic check of that visual inspection is sketched below.
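+
+This sketch simply tabulates the unique values observed in the `Degrees` column (assuming `Degrees` is a plain character column at this point); any multi-select (comma-containing) or "Other" free-text responses would show up as their own entries in the count.
+
+```{r}
+resultsTidy %>%
+  count(Degrees, sort = TRUE)
+```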
+
+Let's also set the factor levels to follow the general progression of degrees.
+
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(
+ Degrees =
+ factor(recode(Degrees, "PhD, MD" = "MD/PhD"), levels = c("High School or equivalent", "Bachelor's degree", "Master's degree in progress", "Master's degree", "PhD in progress", "PhD", "MD in progress", "MD", "MD/PhD"))
+ )
+```
+
+## Simplified experience status for various research categories (clinical, human genomics, non-human genomics)
+
+We want to add three columns that act as flags reporting whether the respondent is
+
+* experienced with clinical research, specifically either moderately or extremely experienced in working with human clinical data
+* experienced with human genomics research, specifically moderately or extremely experienced in working with human genomics data
+* experienced with non-human genomics research, specifically moderately or extremely experienced in working with non-human genomics data
+
+We will use this information later to subset responses when considering popular tools or datasets.
+
+Description of variable definitions and steps
+
+We use a `mutate` together with three `case_when` calls.
+
+* If the `HumanClinicalExperience` column response is "Moderately experienced" or "Extremely experienced", we mark that respondent as a human clinical research expert in the `clinicalFlag` column (`TRUE`). Otherwise, we mark a `FALSE` to signify they are not a clinical research expert.
+* If the `HumanGenomicExperience` column response is "Moderately experienced" or "Extremely experienced", we mark that respondent as a human genomic research expert in the `humanGenomicFlag` column (`TRUE`). Otherwise, we again mark a `FALSE` to signify not an expert.
+* If the `NonHumanGenomicExperience` column response is "Moderately experienced" or "Extremely experienced", we mark that respondent as a non-human genomic research expert in the `nonHumanGenomicFlag` column (`TRUE`). Otherwise, we again mark a `FALSE` to signify not an expert.
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(
+ clinicalFlag = case_when(
+ HumanClinicalExperience == "Moderately experienced" | HumanClinicalExperience == "Extremely experienced" ~ TRUE,
+ .default = FALSE
+ ),
+ humanGenomicFlag = case_when(
+ HumanGenomicExperience == "Moderately experienced" | HumanGenomicExperience == "Extremely experienced" ~ TRUE,
+ .default = FALSE
+ ),
+ nonHumanGenomicFlag = case_when(NonHumanGenomicExperience == "Moderately experienced" | NonHumanGenomicExperience == "Extremely experienced" ~ TRUE,
+ .default = FALSE)
+ )
+```
+
+# Demographics
+
+## Institutional Affiliations
+
+Elizabeth Humphries grouped institutional affiliations into a limited set of categories: R1 University, R2 University, Community College, Medical Center or School, International Location, Research Center, NIH, Industry, and Unknown. We notated those groupings/labels within the `institution_codebook.txt` data file. Grouping into limited institutional affiliation categories allows us to consolidate free answers for easier data visualization and identification of trends.
+
+Description of variable definitions and steps
+
+We use `read_delim()` to read in the institution_codebook file and select just the `InstitutionalAffiliation` and `InstitutionalType` columns (ignoring the column that specifies how institutions were entered by survey respondents). We then use a `full_join()` by the `InstitutionalAffiliation` column so that the category labels are added to `resultsTidy` as a new `InstitutionalType` column, matched to each response's `InstitutionalAffiliation`.
+
+
+
+```{r}
+institutionCodeBook <- read_delim(here("data/institution_codebook.txt"), delim="\t", col_select = c(InstitutionalAffiliation, InstitutionalType))
+
+resultsTidy <- full_join(resultsTidy, institutionCodeBook, by = "InstitutionalAffiliation")
+```
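+
+An optional sanity check on that join (a sketch): because this is a full join, any codebook institutions with no matching survey response would appear as rows with NA values in the survey columns, and any responses without a codebook entry would have an NA `InstitutionalType`.
+
+```{r}
+# Responses without a codebook category
+sum(is.na(resultsTidy$InstitutionalType))
+
+# Codebook-only rows that matched no survey response
+sum(is.na(resultsTidy$UserType))
+```
+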
+### Further simplify Institutional Affiliations to focus on Research Intensive, Education Focused, and Industry & Other
+
+```{r}
+resultsTidy %<>%
+ mutate(FurtherSimplifiedInstitutionalType =
+ case_when(
+ InstitutionalType == "R1 University" ~ "Research Intensive",
+ InstitutionalType == "Research Center" ~ "Research Intensive",
+ InstitutionalType == "Medical Center or School" ~ "Research Intensive",
+ InstitutionalType == "NIH" ~ "Research Intensive",
+ InstitutionalType == "R2 University" ~ "Education Focused",
+ InstitutionalType == "Community College" ~ "Education Focused",
+ InstitutionalType == "Industry" ~ "Industry & Other",
+ InstitutionalType == "International Location" ~ "Industry & Other",
+ InstitutionalType == "Unknown" ~ "Industry & Other"
+ )
+ )
+```
+
+
+### Number of institutions represented in responses
+
+```{r}
+length(unique(resultsTidy$InstitutionalAffiliation))
+```
+
+### Institution type
+
+Let's make a bar chart that shows how many responses came from each institution, colored by institution type.
+
+Description of variable definitions and steps
+
+We first prepare the data by using `group_by` on the columns of interest from `resultsTidy` (`InstitutionalAffiliation` and `InstitutionalType`) and `summarize(InstitutionalCount = n())` to add a count (`InstitutionalCount`) for every `InstitutionalAffiliation`. We include `InstitutionalType` in the `group_by` even though it's redundant for what we're displaying, since we'll want to color by institution type.
+
+We then plot the data with the Affiliation on the y-axis (reordered by the count so largest count is on top),
+the count on the x-axis, and the fill color being the institutional type.
+
+We change some theme and label elements and add a grob annotation to specify how many unique institutions are represented in this graph.
+
+
+
+```{r}
+resultsTidy %>%
+ group_by(InstitutionalAffiliation, InstitutionalType) %>% summarize(InstitutionalCount = n()) %>%
+ ggplot(aes(
+ y = reorder(InstitutionalAffiliation, InstitutionalCount),
+ x = InstitutionalCount,
+ fill = InstitutionalType
+ )) + geom_bar(stat = "identity") +
+ theme_bw() +
+ theme(
+ panel.background = element_blank(),
+ panel.grid.minor = element_blank(),
+ panel.grid.major.y = element_blank()
+ ) +
+ ylab("Institutional Affiliation") + xlab("Count") +
+ ggtitle("What institution are you affiliated with?")+
+ annotation_custom(textGrob(paste("There are\n", length(unique(resultsTidy$InstitutionalAffiliation)) ,"\nunique institutions"), gp=gpar(fontsize=8, fontface = "bold")),xmin=7,xmax=7,ymin=3,ymax=3) +
+ coord_cartesian(clip = "off")
+
+ggsave(here("plots/institutionalAffilition_allResponses.png"))
+```
+
+Taking a less granular approach, aggregating by institution type rather than looking at names of institutions.
+
+```{r}
+resultsTidy %>%
+ mutate(UserType = factor(UserType, levels = c("PotentialUser", "CurrentUser"))) %>%
+ group_by(UserType, InstitutionalType) %>% summarize(InstitutionalCount = n()) %>%
+ ggplot(aes(
+ y = reorder(InstitutionalType, InstitutionalCount, sum),
+ x = InstitutionalCount,
+ fill = UserType
+ )) + geom_bar(position = "stack", stat = "identity") +
+ theme_bw() +
+ theme(
+ panel.background = element_blank(),
+ panel.grid.minor.y = element_blank(),
+ panel.grid.major.y = element_blank()
+ ) +
+ ylab("Institutional Affiliation") + xlab("Count") +
+ ggtitle("Institutional Affiliation for All Survey Respondents")+
+ annotation_custom(textGrob(paste("There are\n", length(unique(resultsTidy$InstitutionalAffiliation)) ,"\nunique institutions"), gp=gpar(fontsize=8, fontface = "bold")),xmin=34,xmax=34,ymin=2.5,ymax=2.5) +
+ coord_cartesian(clip = "off") +
+ #scale_fill_manual(values = c("#035C94", "#E0DD10")) +
+ scale_fill_manual(values = c("#E0DD10", "#035C94")) +
+ ggtitle("What institution are you affiliated with?")
+
+ggsave(here("plots/institutionalType_allResponses_colorUserType.png"))
+
+```
+
+Simplifying even further, using the `FurtherSimplifiedInstitutionalType` groupings.
+
+```{r}
+resultsTidy %>%
+ mutate(UserType = factor(UserType, levels = c("PotentialUser", "CurrentUser"))) %>%
+ mutate(FurtherSimplifiedInstitutionalType = factor(FurtherSimplifiedInstitutionalType, levels = c("Industry & Other", "Education Focused", "Research Intensive"))) %>%
+ group_by(UserType, FurtherSimplifiedInstitutionalType) %>% summarize(InstitutionalCount = n()) %>%
+ ggplot(aes(
+ y = FurtherSimplifiedInstitutionalType,
+ x = InstitutionalCount,
+ fill = UserType
+ )) + geom_bar(position = "stack", stat = "identity") +
+ theme_bw() +
+ theme(
+ panel.background = element_blank(),
+ panel.grid.minor.y = element_blank(),
+ panel.grid.major.y = element_blank()
+ ) +
+ ylab("") +
+ xlab("Count") +
+ ggtitle("Institutional Affiliation for All Survey Respondents") +
+ annotation_custom(textGrob("- R1 University \n- Med Campus \n- Research Center\n- NIH ", gp = gpar(fontsize = 8)), xmin = -8.5, xmax = -8.5, ymin = 2.65, ymax = 2.65) +
+ annotation_custom(textGrob("- Industry \n- International Loc\n- Unknown ", gp = gpar(fontsize = 8)), xmin = -8.5, xmax = -8.5, ymin = .7, ymax = .7) +
+ annotation_custom(textGrob("- R2 University \n- Community College", gp=gpar(fontsize=8)),xmin=-8.5,xmax=-8.5,ymin=1.75,ymax=1.75) +
+ coord_cartesian(clip = "off") +
+ scale_fill_manual(values = c("#E0DD10", "#035C94")) +
+ ggtitle("What institution are you affiliated with?")
+
+ggsave(here("plots/institutionalType_simplified_allResponses_colorUserType.png"))
+
+```
+
+
+#### Just for Current/Returning Users
+
+The above plot was for all survey responses. Here we want to focus on institutions represented by just current users of AnVIL.
+
+Description of variable definitions and steps
+
+We first select rows/responses that are just from Current users. Then we prepare the data and plot following the same scheme as above.
+
+
+
+
+```{r}
+resultsTidy %>%
+ filter(UserType == "CurrentUser") %>%
+ group_by(InstitutionalAffiliation, InstitutionalType) %>% summarize(InstitutionalCount = n()) %>%
+ ggplot(aes(
+ y = reorder(InstitutionalAffiliation, InstitutionalCount),
+ x = InstitutionalCount,
+ fill = InstitutionalType
+ )) + geom_bar(stat = "identity") +
+ theme_bw() +
+ theme(
+ panel.background = element_blank(),
+ panel.grid.minor = element_blank(),
+ panel.grid.major.y = element_blank()
+ ) +
+ ylab("Institutional Affiliation") + xlab("Count") +
+  ggtitle(bquote('Institutional Affiliation for' ~ bold('Current User') ~ 'Respondents')) +
+ annotation_custom(textGrob(paste("There are\n", nrow(unique(resultsTidy[which(resultsTidy$UserType == "CurrentUser"), "InstitutionalAffiliation"])) ,"\nunique institutions"), gp=gpar(fontsize=8, fontface = "bold")),xmin=5.5,xmax=5.5,ymin=3,ymax=3) +
+ coord_cartesian(clip = "off")
+
+ggsave(here("plots/institutionalAffilition_currentUserResponses.png"))
+```
+
+Taking a less granular approach, and just looking at institution type rather than names of institutions. Saving the plot into a variable so that we can combine it with the one for potential users later.
+
+Note that the y-axis label is turned off and the title is simplified to just say Current Users; the x-axis label ("Count") is kept since this will be the bottom plot when combined. The legend is also turned off.
+
+Also used `scale_fill_manual` to set specific colors for the institution types, in order to keep the colors for shared institution types in sync between this plot and the potential users version (`institutionTypePotential`); more info on this with that plot below.
+
+```{r}
+institutionTypeCurrent <- resultsTidy %>%
+ filter(UserType == "CurrentUser") %>%
+ group_by(InstitutionalType) %>% summarize(InstitutionalCount = n()) %>%
+ ggplot(aes(
+ y = reorder(InstitutionalType, InstitutionalCount),
+ x = InstitutionalCount,
+ fill = InstitutionalType
+ )) + geom_bar(stat = "identity") +
+ theme_bw() +
+ theme(
+ panel.background = element_blank(),
+ panel.grid.minor = element_blank(),
+ panel.grid.major.y = element_blank()
+ ) +
+ ylab("") +
+ xlab("Count") +
+ #xlab("") +
+ ggtitle(bquote(bold("Current Users"))) +
+ coord_cartesian(clip = "off") +
+ scale_fill_manual(values = c("R1 University" = "#FDB462",
+ "Research Center" = "#FCCDE5",
+ "Medical Center or School" = "#FB8072",
+ "R2 University" = "#B3DE69")) +
+ theme(legend.position = "none")
+
+institutionTypeCurrent
+
+#ggsave(here("plots/institutionalType_currentUserResponses.png"), plot = institutionTypeCurrent)
+```
+
+#### Just for Potential Users
+
+Here we want to focus on institutions represented by just potential users of AnVIL.
+
+Description of variable definitions and steps
+
+We first select rows/responses that are just from potential users. Then we prepare the data and plot following the same scheme as above.
+
+
+
+```{r}
+resultsTidy %>%
+ filter(UserType == "PotentialUser") %>%
+ group_by(InstitutionalAffiliation, InstitutionalType) %>% summarize(InstitutionalCount = n()) %>%
+ ggplot(aes(
+ y = reorder(InstitutionalAffiliation, InstitutionalCount),
+ x = InstitutionalCount,
+ fill = InstitutionalType
+ )) + geom_bar(stat = "identity") +
+ theme_bw() +
+ theme(
+ panel.background = element_blank(),
+ panel.grid.minor = element_blank(),
+ panel.grid.major.y = element_blank()
+ ) +
+ ylab("Institutional Affiliation") + xlab("Count") +
+  ggtitle(bquote('Institutional Affiliation for' ~ bold('Potential User') ~ 'Respondents')) +
+ annotation_custom(textGrob(paste("There are\n", nrow(unique(resultsTidy[which(resultsTidy$UserType == "PotentialUser"), "InstitutionalAffiliation"])) ,"\nunique institutions"), gp=gpar(fontsize=8, fontface = "bold")),xmin=6,xmax=6,ymin=1.5,ymax=1.5) +
+ coord_cartesian(clip = "off")
+
+
+ggsave(here("plots/institutionalAffilition_potentialUserResponses.png"))
+```
+
+Taking a less granular approach, and just looking at institution type rather than names of institutions.
+
+We wanted to sync the colors between the current and potential institutional types, so we used the Set3 palette for `scale_fill_brewer` since it has 12 colors (and 9 are needed) and it seemed more accessible than the Paired palette. To see the hex codes that were assigned to the shared institution types in this plot, we used the `scales` library and `brewer_pal(palette = "Set3")(9)` (see the chunk below).
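+
+A sketch of that lookup (assuming the `scales` package is installed):
+
+```{r}
+# The first 9 hex codes from the Set3 palette, in the order ggplot2 would
+# assign them to 9 fill levels
+scales::brewer_pal(palette = "Set3")(9)
+```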
+
+Turned off both the y- and x-axis labels since this will be the top plot when combined with the current user version (`institutionTypeCurrent`), which keeps the x-axis label. Also used `xlim` to sync the x-axis limits between the two.
+
+Simplified the title to just be Potential Users. Turned off the legend.
+
+```{r}
+institutionTypePotential <- resultsTidy %>%
+ filter(UserType == "PotentialUser") %>%
+ group_by(InstitutionalType) %>% summarize(InstitutionalCount = n()) %>%
+ ggplot(aes(
+ y = reorder(InstitutionalType, InstitutionalCount),
+ x = InstitutionalCount,
+ fill = InstitutionalType
+ )) + geom_bar(stat = "identity") +
+ theme_bw() +
+ theme(
+ panel.background = element_blank(),
+ panel.grid.minor = element_blank(),
+ panel.grid.major.y = element_blank()
+ ) +
+ ylab("") +
+ xlab("") +
+ #xlab("Count") +
+ xlim(0,15) +
+ ggtitle(bquote(bold("Potential Users"))) +
+ coord_cartesian(clip = "off") +
+ scale_fill_brewer(palette = "Set3") +
+ theme(legend.position = "none")
+
+institutionTypePotential
+
+#ggsave(here("plots/institutionalType_potentialUserResponses.png"), plot = institutionTypePotential)
+```
+
+We combine the two plots for institutional type (`institutionTypeCurrent` and `institutionTypePotential`) using patchwork, stacking them on top of each other (`/`) and using `plot_layout` to set the panel heights: there are more institution types for Potential users than Current users, so we want the Current users panel to be shorter than the default.
+
+
+```{r}
+combined_plot <- institutionTypePotential / institutionTypeCurrent + plot_layout(heights = unit(c(4, 2),'cm')) + plot_annotation("What institution are you affiliated with?")
+
+combined_plot
+
+ggsave(here("plots/institutionalType_facetedUserType.png"), plot = combined_plot)
+```
+
+## Highest Degree attained (or in progress)
+
+Description of variable definitions and steps
+
+First we take the columns of interest from `resultsTidy`: `Degrees` and `UserType`. Then we use `group_by` in conjunction with `summarize(n = n())` to add counts for how many of each combination are observed in the data.
+
+Then we send this data to ggplot and make a bar chart with the x-axis representing the degrees, `reorder`ed by the summed count (higher totals first), because otherwise the 2 MDs would be placed after the high school and master's-in-progress bars (1 each). The y-axis represents the count, and the fill specifies user type (current or potential AnVIL users). We use a stacked bar chart and include a label above each bar with the total sum for that degree type.
+
+Used [this Stack Overflow post to label sums above the bars](https://stackoverflow.com/questions/30656846/draw-the-sum-value-above-the-stacked-bar-in-ggplot2)
+
+and used [this Stack Overflow post to remove NA from the legend](https://stackoverflow.com/questions/45493163/ggplot-remove-na-factor-level-in-legend)
+
+The rest of the changes are related to theme and labels and making sure that the numerical bar labels aren't cut off on the top.
+
+
+
+```{r}
+
+resultsTidy %>%
+ mutate(UserType = factor(UserType, levels = c("PotentialUser", "CurrentUser"))) %>%
+ group_by(Degrees, UserType) %>%
+ summarize(n = n()) %>%
+ ggplot(aes(x = reorder(Degrees, -n, sum),
+ y = n,
+ fill = UserType
+ )) +
+ geom_bar(position = "stack", stat="identity") +
+ geom_text(
+ aes(label = after_stat(y), group = Degrees),
+ stat = 'summary', fun = sum, vjust = -1, size=2
+ ) +
+ theme_classic() + theme(axis.text.x = element_text(angle = 45, hjust=1)) +
+ xlab("Degree") +
+ ylab("Count") +
+ coord_cartesian(clip = "off") +
+ scale_fill_manual(values = c("#E0DD10", "#035C94"), na.translate = F) +
+ ggtitle("What is the highest degree you have attained?")
+
+ggsave(here("plots/degree_usertype.png"))
+```
+
+## Human Genomic, Non-human Genomic, and Human Clinical Research Experience
+
+Question and possible answers
+
+>How much experience do you have analyzing the following data categories?
+
+The data categories were
+
+* Human genomic
+* Non-human genomic
+* Human clinical
+
+and for each category, possible options were
+
+* Not at all experienced
+* Slightly experienced
+* Somewhat experienced
+* Moderately experienced
+* Extremely experienced
+
+
+
+### Prepare data
+
+Description of variable definitions and steps
+
+Here we select the columns containing answers for each data category: `HumanGenomicExperience`, `HumanClinicalExperience`, and `NonHumanGenomicExperience`. We also select `UserType` in case we want to split out user type when viewing the data. We use a `pivot_longer` to make a long dataframe that can be grouped and the groups counted. The category/column names go to a new column, `researchType`, and the values in those columns go to a new column, `experienceLevel`. Before we group and count, we set the factor levels on the new `experienceLevel` column to match the progression from not at all experienced to extremely experienced, and we rename the research categories so that the words have spaces and say research instead of experience. Then we use `group_by` and `summarize` to add counts for each combination of research category, experience level, and UserType. These counts are in the new `n` column.
+
+
+
+```{r}
+experienceDf <- resultsTidy %>% select(HumanGenomicExperience, HumanClinicalExperience, NonHumanGenomicExperience, UserType) %>%
+ pivot_longer(c(HumanGenomicExperience, HumanClinicalExperience, NonHumanGenomicExperience), names_to = "researchType", values_to = "experienceLevel") %>%
+ mutate(experienceLevel =
+ factor(experienceLevel, levels = c("Not at all experienced", "Slightly experienced", "Somewhat experienced", "Moderately experienced", "Extremely experienced")),
+ researchType = case_when(researchType == "HumanClinicalExperience" ~ "Human Clinical Research",
+ researchType == "HumanGenomicExperience" ~ "Human Genomic Research",
+ researchType == "NonHumanGenomicExperience" ~ "Non-human\nGenomic Research")) %>%
+ group_by(researchType, experienceLevel, UserType) %>% summarize(n = n())
+```
+
+### Plot data
+
+Should we split current users vs potential users?
+
+Here we use two different plots to show that the distribution of experience level among these three research types is similar when comparing current users vs potential users. In this first plot, we have the experience level on the x-axis, the count on the y-axis, and color the bars by research type. We stack the user type responses into separate facets using `facet_wrap` with `nrow=2`. We use `position="dodge"` to cluster the research type bars next to each other. And we use `geom_text` to label the bars with the actual count, which requires `group = researchType` within the `geom_text()` `aes()` and `position = position_dodge(width = 0.9)` within the `geom_text()` call itself.
+
+We then also make some theme changes, like rotating the x-axis tick labels, changing the y- and x-axis labels, using a minimal theme to turn off borders, and turning off the grids.
+
+```{r}
+ggplot(experienceDf, aes(x=experienceLevel,y=n, fill=researchType)) +
+ facet_wrap(~UserType, nrow=2) +
+ geom_bar(stat="identity", position="dodge") +
+ theme_minimal() +
+ theme(panel.background = element_blank(), panel.grid = element_blank()) +
+ theme(axis.text.x = element_text(angle = 45, hjust=1)) +
+ geom_text(
+ aes(label = n, group = researchType),
+ size=2, position = position_dodge(width = .9), vjust=-0.5
+) +
+ ylab("Count") + xlab ("Reported Experience Level") +
+ coord_cartesian(clip = "off")
+
+
+ggsave(here("plots/researchExperienceLevel_colorResearchType.png"))
+```
+
+In this second plot, we have the experience level on the x-axis, the count on the y-axis, and color the bars by experience level. We stack the user type responses and separate out the research types into separate facets using `facet_grid`. And we use geom_text to label the bars with the actual count. This uses `group = experienceLevel` within the `geom_text()` `aes()`.
+
+We then also make some theme changes, like rotating the x-axis tick labels, changing the y- and x-axis labels, expanding the left plot margin, using a classic theme, and turning off the background and grids.
+
+```{r}
+ggplot(experienceDf, aes(x=experienceLevel,y=n, fill=experienceLevel)) +
+ facet_grid(UserType~researchType) +
+ geom_bar(stat="identity") +
+ theme_classic() +
+ theme(panel.background = element_blank(), panel.grid = element_blank()) +
+ theme(axis.text.x = element_text(angle = 45, hjust=1)) +
+ geom_text(
+ aes(label = n, group = experienceLevel), vjust = -1, size=2
+) +
+ ylab("Count") + xlab ("Reported Experience Level") +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1,1.05, "cm")) +
+ theme(legend.position = "none")
+
+
+ggsave(here("plots/researchExperienceLevel_colorExperience.png"))
+```
+
+We include both plots since we don't know which we like better, but both give us confidence that current and potential user counts for reported experience level in these research areas show similar distributions. So we'll go ahead and plot the data without splitting out UserType.
+
+
+
+#### Preferred bar plot
+
+Description of variable definitions and steps
+
+This bar plot has the experience level on the x-axis, the count on the y-axis, and fills the bars according to the experience level (though the fill/color legend is turned off by setting `legend.position` to "none"). We facet on the research category type and label the bars. We keep the `stat = 'summary'` with `fun = sum` and `after_stat(y)` for the label since the data still has splits (like UserType) that we're not visualizing here, and the counts need to be summed across them.
+
+We adjust various aspects of the theme, like turning off the grid and background, rotating the x-tick labels, and changing the x- and y-axis labels. We also slightly widen the left plot margin so that the tick labels aren't cut off.
+
+
+
+```{r}
+ggplot(experienceDf, aes(x=experienceLevel,y=n, fill=experienceLevel)) +
+ facet_grid(~researchType) +
+ geom_bar(stat="identity") +
+ theme_bw() +
+ theme(panel.background = element_blank(), panel.grid = element_blank()) +
+ theme(axis.text.x = element_text(angle = 45, hjust=1)) +
+ geom_text(
+ aes(label = after_stat(y), group = experienceLevel),
+ stat = 'summary', fun = sum, vjust = -0.5, size=2
+) +
+ ylab("Count") + xlab ("Reported Experience Level") +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1,1.05, "cm")) +
+ theme(legend.position = "none") +
+ ggtitle("How much experience do you have analyzing the following data categories?")
+
+
+ggsave(here("plots/researchExperienceLevel_colorExperienceLevel_noUserTypeSplit.png"))
+```
+
+This is the same plot, but with no color fill on the bars in order to remove redundancy.
+
+```{r}
+ggplot(experienceDf, aes(x=experienceLevel,y=n)) +
+ facet_grid(~researchType) +
+ geom_bar(stat="identity") +
+ theme_bw() +
+ theme(panel.background = element_blank(), panel.grid = element_blank()) +
+ theme(axis.text.x = element_text(angle = 45, hjust=1)) +
+ geom_text(
+ aes(label = after_stat(y), group = experienceLevel),
+ stat = 'summary', fun = sum, vjust = -0.5, size=2
+) +
+ ylab("Count") + xlab ("Reported Experience Level") +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1,1.05, "cm")) +
+ theme(legend.position = "none")+
+ ggtitle("How much experience do you have analyzing the following data categories?")
+
+
+ggsave(here("plots/researchExperienceLevel_noColor_noUserTypeSplit.png"))
+```
+
+
+# Insights
+
+## Comparisons of rank of importance of features/resources between Current Users and Potential Users
+
+>Rank the following features or resources according to their importance for your continued use of the AnVIL
+
+>Rank the following features or resources according to their importance to you as a potential user of the AnVIL?
+
+* Easy billing setup
+* Flat-rate billing rather than use-based
+* Free version with limited compute or storage
+* On demand support and documentation
+* Specific tools or datasets are available/supported
+* Greater adoption of the AnVIL by the scientific community
+
+We're going to look at a comparison of the assigned ranks for these features, comparing between current users and potential users.
+
+### Recode rank values
+
+Description of variable definitions and steps
+
+Columns of interest include
+
+* PotentialRankEasyBillingSetup
+* PotentialRankFlatRateBilling
+* PotentialRankFreeVersion
+* PotentialRankSupportDocs
+* PotentialRankToolsData
+* PotentialRankCommunityAdoption
+* CurrentRankEasyBillingSetup
+* CurrentRankFlatRateBilling
+* CurrentRankFreeVersion
+* CurrentRankSupportDocs
+* CurrentRankToolsData
+* CurrentRankCommunityAdoption
+
+We can use `starts_with` to select these columns, specifically matching on the prefixes "PotentialRank" and "CurrentRank". When we made simplified names for the columns, these were the only twelve columns given names starting that way.
+
+Each survey taker was asked either the 6 CurrentRank questions or the 6 PotentialRank questions, which means we expect NULL values in these columns since no survey taker answered all twelve.
+
+We want to recode the following values:
+
+* Replace "1 (Most important in this list)" with "1"
+* Replace "6 (Least important in this list)" with "6"
+
+Before we can do that, we first need to change the type of the columns, which we don't want to be list columns. The non-tidyverse way of doing this would be `unlist(as.character(resultsTidy$PotentialRankEasyBillingSetup))`. Instead we can use the tidyverse `unnest` function with a `keep_empty = TRUE` argument so that it preserves the NULL values. Notice that in the non-tidyverse way, we had to use `as.character` in order to preserve the NULL values. In the tidyverse way, we still have to apply an `as.character` type change before the `unnest`; otherwise, we get an error that double and character values can't be combined.
+
+After the `unnest` we can use the `recode` function to make the replacements specified above. Then we change the type from character to integer so that we can compute average ranks and plot them more easily. Without further handling, there would be a warning that NAs are introduced by coercion when changing the type to integer, so we add a replacement in the `recode`, changing "NULL" to `NA_character_`. A toy illustration of the list-column handling is sketched below.
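+
+A minimal toy illustration (hypothetical tibble, not the survey data) of why the `as.character` step comes before `unnest(keep_empty = TRUE)`:
+
+```{r}
+toy <- tibble(
+  id = 1:3,
+  rank = list("1 (Most important in this list)", NULL, 3)
+)
+
+# Unnesting the raw list column would error because character and double
+# values can't be combined. Converting to character first avoids that, and
+# keep_empty = TRUE keeps the NULL entry as an NA row instead of dropping it.
+toy %>%
+  mutate(rank = map(rank, as.character)) %>%
+  unnest(rank, keep_empty = TRUE)
+```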
+
+
+
+```{r}
+resultsTidy %<>%
+ mutate(across(starts_with(c(
+ "PotentialRank", "CurrentRank"
+ )), as.character)) %>%
+ unnest(starts_with(c("PotentialRank", "CurrentRank")), keep_empty = TRUE) %>%
+ mutate(across(
+ starts_with(c("PotentialRank", "CurrentRank")),
+ ~ recode(
+ .x,
+ "1 (Most important in this list)" = "1",
+ "6 (Least important in this list)" = "6",
+ "NULL" = NA_character_
+ )
+ )) %>%
+ mutate(across(starts_with(c(
+ "PotentialRank", "CurrentRank"
+ )), as.integer))
+```
+
+### Numerical response bias
+
+Visualizing the numerical response bias, since there were non-unique ranks assigned by some respondents.
+
+```{r}
+resultsTidy %>%
+ select(starts_with("PotentialRank")) %>%
+ rowSums(na.rm = TRUE) %>%
+ table() %>% as.data.frame()
+```
+
+We would expect a row sum of 21 if a 6, 5, 4, 3, 2, and 1 were each selected once. We see row sums ranging from 6 (ranking everything 1) to 24. Only 8 out of 28 responses have a row sum of 21, and even that doesn't guarantee that all choices received a unique ranking for those 8 responses (e.g., three 2's, one 4, one 5, and one 6 also sum to 21). So this table shows that 20 responses definitely did not use unique ranks for all 6 questions. Given that most of the observed sums are less than 21, people showed a bias towards ranking things as more important (closer to 1). A more direct check for duplicated ranks is sketched below.
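+
+A more direct check (a sketch; note that `drop_na()` keeps only respondents who answered all six PotentialRank questions): count how many respondents assigned the same rank to more than one feature.
+
+```{r}
+resultsTidy %>%
+  select(starts_with("PotentialRank")) %>%
+  drop_na() %>%
+  apply(1, function(ranks) any(duplicated(ranks))) %>%
+  sum()
+```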
+
+```{r}
+resultsTidy %>%
+ select(starts_with("CurrentRank")) %>%
+ rowSums(na.rm = TRUE) %>%
+ table() %>% as.data.frame()
+```
+
+We again would expect a row sum of 21 if a 6, 5, 4, 3, 2, and 1 were each selected once. We see row sums ranging from 6 (ranking everything 1) to 26. Only 9 out of 22 responses have a row sum of 21, and even that doesn't guarantee that all choices received a unique ranking for those 9 responses (e.g., three 2's, one 4, one 5, and one 6 also sum to 21). So this table shows that 13 responses definitely did not use unique ranks for all 6 questions. Given that most of the observed sums are less than 21, people showed a bias towards ranking things as more important (closer to 1).
+
+We can visualize the numerical response bias, where people tended to rate things as more important, by creating a density plot of all rankings no matter the feature queried or the user type.
+
+```{r}
+resultsTidy %>%
+ select(starts_with(c("CurrentRank", "PotentialRank"))) %>%
+ pivot_longer(cols = everything()) %>%
+ drop_na() %>%
+ ggplot(aes(x = value)) +
+ geom_density() +
+ theme_bw() + theme(panel.background = element_blank()) +
+ xlab("Rank") + scale_x_continuous(breaks = 1:6, labels = 1:6)
+```
+
+
+
+### Plot dumbbell plot
+
+#### Prepare data
+
+Average rank is the total rank (sum of given ranks) divided by the number of votes (number of given ranks).
+
+Description of variable definitions and steps
+
+We make two different dataframes that find the total ranks (column name: `totalRank`) and average ranks (column name: `avgRank`) for each feature, and then row bind (`bind_rows`) these two dataframes together to make `totalRanksdf`. The reason we make them separately is that one is for Potential users (`starts_with("PotentialRank")`) and one is for Current users (`starts_with("CurrentRank")`). They have a different number of votes `nranks`, so it made more sense to work with them separately, following the same steps, and then row bind them together.
+
+The individual steps for each of these dataframes are to
+
+* `select` the relevant columns from `resultsTidy`
+* perform sums with `colSums`, adding together the ranks in those columns (each column corresponds to a queried feature); We set `na.rm = TRUE` to ignore the NAs (since not every survey respondent was asked each question; e.g., if they were a current user they weren't asked as a potential user)
+* send those sums to a data frame such that the selected column names from the first step are now the row names and the total summed rank is the only column with values in each row corresponding to each queried feature
+* Use a `mutate` to
+ * add a new column `nranks` that holds the number of survey responses from potential users (i.e., the number that would have assigned ranks to the PotentialRank questions) or the number of survey responses from current/returning users (i.e., the number that would have assigned ranks to the CurrentRank questions).
+ * add a new column `avgRank` that divides the `totalRank` by the `nranks`
+
+After these two dataframes are bound together (`bind_rows`), the rest of the steps are for aesthetics in plotting and making sure ggplot knows the UserType and the feature of interest, etc.
+
+* We move the rownames to their own column `UsertypeFeature` (with the `mutate(UsertypeFeature = rownames(.))`).
+* We `separate` the values in that column on the word "Rank", removing the `UsertypeFeature` column we just made and creating two new columns (`Usertype` and `Feature`), where `Usertype` is either "Current" or "Potential" and `Feature` holds the abbreviated feature names seen in the code below, because...
+* We then use a `case_when` within a `mutate()` to fill out those features so they're more informative and show the choices survey respondents were given.
+
+
+
+```{r}
+totalRanksdf <-
+ bind_rows(
+ resultsTidy %>%
+ select(starts_with("PotentialRank")) %>%
+ colSums(na.rm = TRUE) %>%
+ as.data.frame() %>% `colnames<-`(c("totalRank")) %>%
+ mutate(nranks = sum(resultsTidy$UserType == "PotentialUser"),
+ avgRank = totalRank / nranks),
+ resultsTidy %>%
+ select(starts_with("CurrentRank")) %>%
+ colSums(na.rm = TRUE) %>%
+ as.data.frame() %>% `colnames<-`(c("totalRank")) %>%
+ mutate(nranks = sum(resultsTidy$UserType == "CurrentUser"),
+ avgRank = totalRank /nranks)
+ ) %>%
+ mutate(UsertypeFeature = rownames(.)) %>%
+ separate(UsertypeFeature, c("Usertype", "Feature"), sep = "Rank", remove = TRUE) %>%
+ mutate(Feature =
+ case_when(Feature == "EasyBillingSetup" ~ "Easy billing setup",
+ Feature == "FlatRateBilling" ~ "Flat-rate billing rather than use-based",
+ Feature == "FreeVersion" ~ "Free version with limited compute or storage",
+ Feature == "SupportDocs" ~ "On demand support and documentation",
+ Feature == "ToolsData" ~ "Specific tools or datasets are available/supported",
+ Feature == "CommunityAdoption" ~ "Greater adoption of the AnVIL by the scientific community"),
+ Usertype = factor(case_when(Usertype == "Potential" ~ "Potential Users",
+ Usertype == "Current" ~ "Current Users"), levels = c("Potential Users", "Current Users"))
+ )
+```
+
+#### Dumbbell plot
+
+Description of variable definitions and steps
+
+We use the `totalRanksdf` we just made. The x-axis is the `avgRank` values, and the y-axis displays the informative `Feature` values; however, we `reorder` the y-axis so that features with more important (lower) average ranks are displayed higher in the plot.
+
+`geom_point` and `geom_line` are used in conjunction to produce the dumbbell look of the plot, and we set the color of the points to correspond to the `Usertype`.
+
+Some theme things are changed, labels and titles added, and then we display and save that plot.
+
+The first version of the plot has trimmed limits, so the second version sets limits on the x-axis of 1 to 6 since those were the options survey respondents were given for ranking. It also adds annotations (using [Grobs, explained in this Stack Overflow post answer](https://stackoverflow.com/a/31081162)) to specify which rank was "Most important" and which was "Least important".
+
+Then we've also adjusted the left margin so that the annotation isn't cut off.
+
+We then display and save that version as well. Finally, a third version reverses the x-axis (`scale_x_reverse`) so that rank 1 (most important) sits on the right, with the annotations repositioned to match.
+
+
+
+```{r}
+gdumbbell <- ggplot(totalRanksdf, aes(x = avgRank, y = reorder(Feature, -avgRank))) +
+ geom_line() +
+ geom_point(aes(color = Usertype), size = 3) +
+ theme(panel.background = element_blank()) + theme_bw() + theme(legend.position = "bottom") +
+ xlab("Average Rank") +
+ ylab("Feature") +
+ ggtitle("Rank the following features according to\ntheir importance to you as a potential user\nor for your continued use of the AnVIL") +
+ scale_color_manual(values = c("#E0DD10", "#035C94")) +
+ theme(legend.title = element_blank())
+
+gdumbbell
+
+ggsave(here("plots/dumbbellplot_rankfeatures.png"), plot = gdumbbell)
+
+gdumbbell <- gdumbbell +
+ scale_x_continuous(breaks = 1:6, labels = 1:6, limits = c(1,6))+
+ annotation_custom(textGrob("Most\nimportant", gp=gpar(fontsize=8, fontface = "bold")),xmin=1,xmax=1,ymin=-0.5,ymax=-0.5) +
+ annotation_custom(textGrob("Least\nimportant", gp=gpar(fontsize=8, fontface= "bold")),xmin=6,xmax=6,ymin=-0.5,ymax=-0.5) +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1,1.1, "cm")) +
+ scale_color_manual(values = c("#E0DD10", "#035C94"))
+
+gdumbbell
+
+ggsave(here("plots/dumbbellplot_xlim16_rankfeatures.png"), plot = gdumbbell)
+
+gdumbbell <- gdumbbell +
+ scale_x_reverse(limits = c(6,1), breaks = 6:1, labels = 6:1) +
+ annotation_custom(textGrob("Most\nimportant", gp=gpar(fontsize=8, fontface = "bold")),xmin=-1,xmax=-1,ymin=-0.5,ymax=-0.5) +
+ annotation_custom(textGrob("Least\nimportant", gp=gpar(fontsize=8, fontface= "bold")),xmin=-6,xmax=-6,ymin=-0.5,ymax=-0.5)
+
+gdumbbell
+
+ggsave(here("plots/dumbbellplot_xlim16_revaxis_rankfeatures.png"), plot = gdumbbell)
+
+```
+
+### Plot Density plot
+
+#### Prepare data
+
+Description of variable definitions and steps
+
+Here, we just want all of the numerical ranks in one column and we can have additional columns that describe if that rank was from a current or potential user and which feature it corresponds to.
+
+So to make a dataframe `densitydf`, we
+
+* start by selecting the columns of interest from `resultsTidy` using `select(starts_with(c("PotentialRank", "CurrentRank")))`
+* tell it to take this "wide" dataframe and pivot it to a longer one where the values all go to a `value` column, and the column name associated with the value goes into a `name` column.
+* drop rows that have na with `drop_na()` since as described earlier not every survey respondent was asked each question; e.g., if they were a current user they weren't asked as a potential user.
+* Then we `separate` the `name` column on the word "Rank", removing the `name` column and creating two new columns (`Usertype` and `Feature`), where `Usertype` is either "Current" or "Potential" and `Feature` holds the abbreviated feature names seen in the code below, because...
+* We then use a `case_when` within a `mutate()` to fill out those features so they're more informative and show the choices survey respondents were given.
+* we add another `case_when` within that `mutate` to add the word "Users" to the `Usertype` column values.
+
+
+
+```{r}
+densitydf <- resultsTidy %>%
+ select(starts_with(c("PotentialRank", "CurrentRank"))) %>% pivot_longer(cols = everything()) %>% drop_na() %>%
+ separate(name, c("Usertype", "Feature"), sep = "Rank", remove = TRUE) %>%
+ mutate(Feature =
+ case_when(Feature == "EasyBillingSetup" ~ "Easy billing setup",
+ Feature == "FlatRateBilling" ~ "Flat-rate billing rather than use-based",
+ Feature == "FreeVersion" ~ "Free version with limited compute or storage",
+ Feature == "SupportDocs" ~ "On demand support and documentation",
+ Feature == "ToolsData" ~ "Specific tools or datasets are available/supported",
+ Feature == "CommunityAdoption" ~ "Greater adoption of the AnVIL by the scientific community"),
+ Usertype =
+ case_when(Usertype == "Current" ~ "Current Users",
+ Usertype == "Potential" ~ "Potential Users")
+ )
+```
+
+
+#### Density plot
+
+Description of variable definitions and steps
+
+We use the `densitydf` dataframe we just made and the x-axis is raw rank `value` column values, and the y-axis shows the density. The different density curves are grouped and color filled based off of which feature they represent, and we `facet_wrap` or split the plot facets into two rows so that there's one for each user type. We set the alpha value within `geom_density` since so many of the curves are on top of each other.
+
+Some theme things are changed and labels and titles added, and then we display and save that plot.
+
+It also adds annotations (using [Grobs, explained in this Stack Overflow post answer](https://stackoverflow.com/a/31081162)) to specify which rank was "Most important" and which was "Least important".
+
+And it increases the bottom margin so those grob annotations aren't cut off.
+
+
+
+```{r}
+ggplot(densitydf, aes(x=value, group = Feature, fill = Feature)) +
+ facet_wrap(~Usertype, nrow = 2) +
+ geom_density(alpha=0.3) +
+ theme_bw() + theme(panel.background = element_blank()) +
+ xlab("Rank") + scale_x_continuous(breaks = 1:6, labels= 1:6, limits = c(1, 6)) +
+ ggtitle("Rank the following features according to\ntheir importance to you as a potential user\nor for your continued use of the AnVIL")+
+ annotation_custom(textGrob("Most\nimportant", gp=gpar(fontsize=8, fontface = "bold")),xmin=1,xmax=1,ymin=-0.85,ymax=-0.85) +
+ annotation_custom(textGrob("Least\nimportant", gp=gpar(fontsize=8, fontface= "bold")),xmin=6,xmax=6,ymin=-0.85,ymax=-0.85) +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1.25,1, "cm"))
+ggsave(here("plots/densityplot_rankfeatures.png"))
+```
+
+
+#### Density plot with facets for feature
+
+Description of variable definitions and steps
+
+We use the `densitydf` dataframe we just made, but we re-simplify the Features so that they'll fit in the legend. For the plot, the x-axis is raw rank `value` column values, and the y-axis shows the density. The different density curves are grouped and color filled based off of which feature they represent, and we use `facet_grid` to split the plot facets into two rows and 6 columns so that there's one row for each user type and one column per feature. We switch the row/y-axis labels over to the left (using `switch = "y"`) and remove the column/x-axis labels (using `theme(strip.background.x = element_blank(), strip.text.x = element_blank())`)
+
+We use the `unit()` function to create some margins, and then set the plot margins and legend position within another `theme()`. I used [this Stack Overflow post to find this method.](https://stackoverflow.com/questions/29808620/ggplot2-move-legend-to-corner-but-keep-it-in-margin)
+
+Some theme things are changed and labels and titles added, and then we display and save that plot.
+
+
+
+```{r}
+margins = unit(c(1, 10, 1, 1), 'lines')
+
+densitydf %>%
+ mutate(Feature =
+ case_when(Feature == "Easy billing setup" ~ "Easy billing setup",
+ Feature == "Flat-rate billing rather than use-based" ~ "Flat-rate billing",
+ Feature == "Free version with limited compute or storage" ~ "Free version",
+ Feature == "On demand support and documentation" ~ "Support & documentation",
+ Feature == "Specific tools or datasets are available/supported" ~ "Specific tools or datasets",
+ Feature == "Greater adoption of the AnVIL by the scientific community" ~ "More community adoption")
+ ) %>%
+ ggplot(aes(x=value, group = Feature, fill = Feature)) +
+ facet_grid(Usertype~Feature, switch = "y") +
+ geom_density() +
+ theme_bw() + theme(panel.background = element_blank()) + #theme(legend.position = "bottom") +
+ theme(strip.background.x = element_blank(), strip.text.x = element_blank()) +
+ theme(plot.margin=margins, legend.position=c(1.25, 0.5)) +
+ xlab("Rank") + scale_x_continuous(breaks = 1:6, labels= 1:6, limits = c(1, 6)) +
+ ggtitle("Rank the following features according to their importance to you as a\npotential user or for your continued use of the AnVIL")+
+ coord_cartesian(clip = "off")
+
+ggsave(here("plots/densityplot_rankfeatures_faceted.png"))
+```
+
+### Plot Stacked Bar Chart showing number of times for each rank rather than average
+
+#### Prepare data (count)
+
+Description of variable definitions and steps
+
+For this, we want a data frame that gives counts for all of the ranks given to each feature by each UserType.
+
+To do this we
+
+ * Select the relevant columns from `resultsTidy`, specifically using `select(starts_with(c("PotentialRank", "CurrentRank")))`
+ * tell it to take this "wide" dataframe and pivot it to a longer one (`pivot_longer`) where the values all go to a `value` column, and the column name associated with the value goes into a `name` column.
+ * drop rows that have na with `drop_na()` since as described earlier not every survey respondent was asked each question; e.g., if they were a current user they weren't asked as a potential user.
+ * group by the name (feature and UserType combined) and value (the rank) and have it count the number of that specific rank for each feature/UserType combo
+ * rename the columns because it's getting confusing: `name` stays `name`, `value` changes to `rank`, and `n` is used for the count.
+ * Then we `separate` the `name` column on the word "Rank", removing the `name` column and creating two new columns (`Usertype` and `Feature`), where `Usertype` is either "Current" or "Potential" and `Feature` holds the abbreviated feature names seen in the code below, because...
+ * We then use a `case_when` within a `mutate()` to fill out those features so they're more informative and show the choices survey respondents were given.
+ * we add another `case_when` within that `mutate` to add the word "Users" to the `Usertype` column values.
+ * set the ranks to be a factor (so they're treated as a categorical variable with a discrete color scheme rather than a continuous one) with a specified level order (6 down to 1) so that the ranks stack and appear in the legend in the order we want when we plot.
+
+
+
+```{r}
+countdf <- resultsTidy %>%
+ select(starts_with(c("PotentialRank", "CurrentRank"))) %>%
+ pivot_longer(cols = everything()) %>%
+ drop_na() %>%
+ group_by(name, value) %>% count() %>%
+ `colnames<-`(c("name", "rank", "n")) %>%
+ separate(name, c("Usertype", "Feature"), sep = "Rank", remove = TRUE) %>%
+ mutate(Feature =
+ case_when(Feature == "EasyBillingSetup" ~ "Easy billing setup",
+ Feature == "FlatRateBilling" ~ "Flat-rate billing rather than use-based",
+ Feature == "FreeVersion" ~ "Free version with limited compute or storage",
+ Feature == "SupportDocs" ~ "On demand support and documentation",
+ Feature == "ToolsData" ~ "Specific tools or datasets are available/supported",
+ Feature == "CommunityAdoption" ~ "Greater adoption of the AnVIL by the scientific community"),
+ Usertype =
+ case_when(Usertype == "Current" ~ "Current Users",
+ Usertype == "Potential" ~ "Potential Users"),
+ rank = factor(rank, levels = c(6:1))
+ )
+```
+
+#### Stacked bar chart
+
+Description of variable definitions and steps
+
+Using the `countdf` dataframe that we just made, we have the count or `n` column on the x-axis, the `Feature` on the y-axis, and the fill of the bars to be the `rank` (categorical 1, 2, 3, 4, 5, 6). We facet wrap on UserType with two rows so that each facet represents a different UserType.
+
+We use the `position = "fill"` argument in `geom_bar()` so that it's a percent stacked bar rather than raw counts (since current and potential users had a different number of respondents).
+
+We set the labels for the legend so that it specifies which rank is Least important and which is most important, and we reverse the order in the legend so 1 is on top on the legend.
+
+Finally, we set labels and titles and change the theme a bit.
+
+
+
+```{r}
+ggplot(countdf, aes(fill=rank, y=Feature, x=n)) +
+ facet_wrap(~Usertype, nrow=2) +
+ geom_bar(position="fill", stat="identity") +
+ scale_fill_discrete(labels=c('6 (Least\n Important)', '5', '4', '3', '2', '1 (Most\n Important)')) +
+ guides(fill = guide_legend(reverse = TRUE)) +
+ xlab("Percent Responses") +
+ ggtitle("Rank the following features according to\ntheir importance to you as a potential user\nor for your continued use of the AnVIL") +
+ theme_bw() + theme(panel.background = element_blank(), panel.grid = element_blank())
+
+ggsave(here("plots/stackedbarplot_rankfeatures.png"))
+```
+
+## Tool Knowledge and Comfort Separate from the AnVIL and on the AnVIL
+
+>How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)?
+
+>How would you rate your knowledge of or comfort with these technologies (on the AnVIL)?
+
+>How would you rate your knowledge of or comfort with these AnVIL data features?
+
+Shared technologies between these two questions include
+
+* Jupyter Notebooks: `CurrentAnVILTechJupyterNotebooks` & `AllTechJupyterNotebooks`
+* Bioconductor & RStudio: `CurrentAnVILTechRStudio` & `AllTechRStudio` + `AllTechBioconductor`
+* Galaxy: `CurrentAnVILTechGalaxy` & `AllTechGalaxy`
+* WDL Workflows / Workflows (e.g., WDL): `CurrentAnVILTechWDL` & `AllTechWorkflows`
+* Containers: `CurrentAnVILTechContainers` & `AllTechContainers`
+* Unix / Command Line: `CurrentAnVILTechCommandLine` & `AllTechCommandLine`
+
+Technologies only asked separate from the AnVIL
+
+* Python: `AllTechPython`
+* R: `AllTechR`
+
+Technologies/data features only asked with regards to the AnVIL
+
+* Accessing controlled access datasets: `CurrentAnVILTechAccessData`
+* DUOS (Data Use Oversight System): `CurrentAnVILTechDUOS`
+* Terra on AnVIL (Workspaces): `CurrentAnVILTechTerra`
+* TDR (Terra Data Repository): `CurrentAnVILTechTDR`
+
+Possible answers for each of these questions include
+
+* Don't know it (0)
+* Not at all comfortable (1)
+* Slightly comfortable (2)
+* Somewhat comfortable (3)
+* Moderately comfortable (4)
+* Extremely comfortable (5)
+
+Possible "comfort scores" are notated in parentheses next to each possible answer. We'll add these scores as additional columns that start with the prefix "Score_" but otherwise retain the original column name, in case it's helpful to still have the word responses (whose factor levels we'll set to reflect the progression of knowledge/comfort).
+
+Responses are NA if the question wasn't asked to the survey taker (e.g., they were a potential user and weren't asked about technologies with regards to the AnVIL)
+
+It's likely that someone who's a program administrator will select "Don't know it" for these... Should we remove those responses and see how the average scores change? (One way to check is sketched after the next chunk.)
+
+```{r}
+resultsTidy %<>%
+ mutate(across(starts_with(c(
+ "CurrentAnVILTech", "AllTech"
+ )), as.character)) %>%
+ unnest(starts_with(c("CurrentAnVILTech", "AllTech")), keep_empty = TRUE) %>%
+ mutate(across(starts_with(c(
+ "CurrentAnVILTech", "AllTech"
+ )), ~ parse_factor(
+ .,
+ levels = c(
+ "Don't know it",
+ "Not at all comfortable",
+ "Slightly comfortable",
+ "Somewhat comfortable",
+ "Moderately comfortable",
+ "Extremely comfortable"
+ )
+ ))) %>%
+ mutate(across(
+ starts_with(c("CurrentAnVILTech", "ALLTech")),
+ ~ case_when(
+ . == "Don't know it" ~ 0,
+ . == "Not at all comfortable" ~ 1,
+ . == "Slightly comfortable" ~ 2,
+ . == "Somewhat comfortable" ~ 3,
+ . == "Moderately comfortable" ~ 4,
+ . == "Extremely comfortable" ~ 5
+ )
+ ,
+ .names = "Score_{.col}"
+ ))
+```
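+
+One way to check how sensitive the averages are to "Don't know it" responses (a sketch, not part of the main analysis): treat a score of 0 as missing and recompute the column means.
+
+```{r}
+resultsTidy %>%
+  select(starts_with("Score_")) %>%
+  mutate(across(everything(), ~ na_if(.x, 0))) %>%  # 0 = "Don't know it"
+  colMeans(na.rm = TRUE)
+```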
+
+### Dumbbell like plot
+
+#### Prepare the data
+
+```{r}
+toPlot <- bind_rows(
+ resultsTidy %>%
+ filter(UserType == "CurrentUser") %>%
+ select(starts_with("Score_")) %>%
+ colSums(na.rm = TRUE) %>%
+ as.data.frame() %>% `colnames<-`(c("totalScore")) %>%
+ mutate(nscores = sum(resultsTidy$UserType == "CurrentUser"),
+ avgScore = totalScore / nscores,
+ UserType = "Current Users") %>%
+ mutate(WhereTool = rownames(.)) %>%
+ separate(WhereTool, c("AnVILorNo", "Tool"), sep = "Tech", remove = TRUE) %>%
+ mutate(AnVILorNo =
+ case_when(AnVILorNo == "Score_CurrentAnVIL" ~ "On the AnVIL",
+ AnVILorNo == "Score_All" ~ "Separate from the AnVIL"
+ ),
+ Tool =
+ recode(Tool, "JupyterNotebooks" = "Jupyter Notebooks",
+ "WDL" = "Workflows",
+ "CommandLine" = "Unix / Command Line",
+ "AccessData" = "Access controlled access data",
+ "Terra" = "Terra Workspaces",
+ "BioconductorRStudio" = "Bioconductor & RStudio"
+ )
+ ),
+
+ resultsTidy %>%
+ filter(UserType == "PotentialUser") %>%
+ select(starts_with("Score_AllTech")) %>%
+ colSums() %>%
+ as.data.frame() %>% `colnames<-`(c("totalScore")) %>%
+ mutate(nscores = sum(resultsTidy$UserType == "PotentialUser"),
+ avgScore = totalScore / nscores,
+ UserType = "Potential Users") %>%
+ mutate(WhereTool = rownames(.)) %>%
+ separate(WhereTool, c("AnVILorNo", "Tool"), sep = "Tech", remove = TRUE) %>%
+ mutate(AnVILorNo =
+ case_when(AnVILorNo == "Score_CurrentAnVIL" ~ "On the AnVIL",
+ AnVILorNo == "Score_All" ~ "Separate from the AnVIL"
+ ),
+ Tool =
+ recode(Tool, "JupyterNotebooks" = "Jupyter Notebooks",
+ "WDL" = "Workflows",
+ "CommandLine" = "Unix / Command Line",
+ "AccessData" = "Access controlled access data",
+ "Terra" = "Terra Workspaces",
+ "BioconductorRStudio" = "Bioconductor & RStudio"
+ )
+ )
+) %>%
+ mutate(UserType = factor(UserType, levels = c("Potential Users", "Current Users")))
+```
+
+```{r}
+toPlot_simplified <- toPlot %>%
+ filter(AnVILorNo == "Separate from the AnVIL") %>%
+ filter(UserType != "Both Types of Users")
+```
+
+```{r}
+onAnVIL <- toPlot %>%
+ filter(AnVILorNo == "On the AnVIL") %>%
+ right_join(., toPlot_simplified,by = "Tool") %>%
+ bind_rows(.,
+ data.frame(Tool = "RStudio",
+ avgScore.x = toPlot[which(toPlot$Tool == "Bioconductor & RStudio"),"avgScore"],
+ UserType.x = "Current Users",
+ AnVILorNo.x = "On the AnVIL"),
+ data.frame(Tool = "Bioconductor",
+ avgScore.x = toPlot[which(toPlot$Tool == "Bioconductor & RStudio"),"avgScore"],
+ UserType.x = "Current Users",
+ AnVILorNo.x = "On the AnVIL")
+ ) %>% drop_na(avgScore.x)
+```
+
+
+```{r}
+roi <- toPlot[which(toPlot$Tool == "Bioconductor & RStudio"),]
+toPlot <- rows_append(toPlot, data.frame(
+ UserType = rep(roi$UserType,2),
+ avgScore = rep(roi$avgScore,2),
+ AnVILorNo = rep(roi$AnVILorNo,2),
+ Tool = c("Bioconductor", "RStudio")
+ )) %>%
+ rows_delete(., data.frame(roi))
+```
+
+
+#### Plot the dumbbell like plot
+
+We used [this Stack Overflow response](https://stackoverflow.com/a/72309061) to get the values for the `scale_shape_manual()`.
+
+```{r}
+ggplot(toPlot, aes(y = reorder(Tool, avgScore), x = avgScore)) +
+  geom_point(aes(color = UserType, shape = AnVILorNo)) +
+  scale_x_continuous(breaks = 0:5, labels = 0:5, limits = c(0, 5)) +
+  ylab("Tool or Data Resource") +
+  xlab("Average Knowledge or Comfort Score") +
+  theme_bw() +
+  theme(panel.background = element_blank(), panel.grid.minor.x = element_blank()) +
+  # facet_wrap(~UserType, nrow = 3) +
+  annotation_custom(textGrob("Don't know\nat all", gp = gpar(fontsize = 8, fontface = "bold")), xmin = 0, xmax = 0, ymin = -2, ymax = -2) +
+  annotation_custom(textGrob("Extremely\ncomfortable", gp = gpar(fontsize = 8, fontface = "bold")), xmin = 5, xmax = 5, ymin = -2, ymax = -2) +
+  coord_cartesian(clip = "off") +
+  theme(plot.margin = margin(1, 1, 1, 1.1, "cm")) +
+  ggtitle("How would you rate your knowledge of or\ncomfort with these technologies or data features?") +
+  scale_color_manual(values = c("#E0DD10", "#035C94")) +
+  scale_shape_manual(values = c(4, 16))
+
+
+ggsave(here("plots/tooldataresourcecomfortscore_singlepanel.png"))
+
+simplerPlot <- ggplot(toPlot_simplified, aes(y = reorder(Tool, avgScore), x=avgScore)) + geom_point(aes(color = UserType)) +
+ geom_line() +
+ scale_x_continuous(breaks = 0:5, labels = 0:5, limits = c(0,5)) + ylab("Tool or Resource") + xlab("Average Knowledge or Comfort Score") + theme_bw() + theme(panel.background = element_blank(), panel.grid.minor.x = element_blank()) +
+ annotation_custom(textGrob("Don't know\nat all", gp=gpar(fontsize=8, fontface = "bold")),xmin=0,xmax=0,ymin=-1.5,ymax=-1.5) +
+ annotation_custom(textGrob("Extremely\ncomfortable", gp=gpar(fontsize=8, fontface= "bold")),xmin=5,xmax=5,ymin=-1.5,ymax=-1.5) +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1,1.1, "cm"))+
+ ggtitle("How would you rate your knowledge of or\ncomfort with these technologies\n(separate from the AnVIL)?")
+
+simplerPlot
+
+ggsave(here("plots/toolsSeparateFromAnVIL_comfortscore.png"), plot = simplerPlot)
+```
+
+
+
+
+```{r}
+simplerPlot + geom_point(data = onAnVIL, aes(x=avgScore.x,y=Tool),colour="#C77CFF")
+#how to add label for what purple point is?
+
+ggsave(here("plots/tools_comfortscore.png"))
+```
+
+```{r}
+toPlot %>%
+ filter(Tool == "DUOS" | Tool == "Access controlled access data" | Tool == "TDR" | Tool == "Terra Workspaces") %>%
+ ggplot(aes(y = reorder(Tool, avgScore), x=avgScore)) + geom_point(colour = "#F8766D") +
+ scale_x_continuous(breaks = 0:5, labels = 0:5, limits = c(0,5)) + ylab("Data Resource") + xlab("Average Knowledge or Comfort Score") + theme_bw() + theme(panel.background = element_blank(), panel.grid.minor.x = element_blank()) +
+ annotation_custom(textGrob("Don't know\nat all", gp=gpar(fontsize=8, fontface = "bold")),xmin=0,xmax=0,ymin=-0.35,ymax=-0.35) +
+ annotation_custom(textGrob("Extremely\ncomfortable", gp=gpar(fontsize=8, fontface= "bold")),xmin=5,xmax=5,ymin=-0.35,ymax=-0.35) +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1,1.1, "cm"))+
+ ggtitle("How would you rate your knowledge of or\ncomfort with these AnVIL data features?")
+
+ggsave(here("plots/dataresources_comfortscore.png"))
+```
+
+## What datasets would respondents like to access? Which ones are most popular?
+
+>What large, controlled access datasets do you access or would you be interested in accessing using the AnVIL?
+
+* All of Us*
+* Centers for Common Disease Genomics (CCDG)
+* The Centers for Mendelian Genomics (CMG)
+* Clinical Sequencing Evidence-Generating Research (CSER)
+* Electronic Medical Records and Genomics (eMERGE)
+* Gabriella Miller Kids First (GMKF)
+* Genomics Research to Elucidate the Genetics of Rare Diseases (GREGoR)
+* The Genotype-Tissue Expression Project (GTEx)
+* The Human Pangenome Reference Consortium (HPRC)
+* Population Architecture Using Genomics and Epidemiology (PAGE)
+* Undiagnosed Disease Network (UDN)
+* UK Biobank*
+* None
+* Other (Free Text Response)
+
+Since this is a select-all-that-apply question, we expect multiple comma-separated responses. The free text responses will likely need to be recoded as well. The responses are in the `AccessWhichControlledData` column.
+
+### Prepare the data
+
+Description of variable definitions and steps
+
+We make a function, `prep_df_whichData()`, since we'll reuse this workflow for several subsets of the data: we want to display the data separately by the experience status of the respondents requesting access (experienced with clinical research, human genomics research, etc.).
+
+We want to color the bars based on whether or not each controlled access dataset is currently available on the AnVIL, and we create a dataframe `onAnVILDF` to record this. We used the [AnVIL dataset catalog/browser](https://explore.anvilproject.org/datasets) to determine availability. HPRC and GREGoR don't show up in that resource, but both are available per these sources: [Announcement for HPRC](https://anvilproject.org/news/2021/03/11/hprc-on-anvil), [Access for HPRC](https://anvilproject.org/data/consortia/HPRC), [Access for GREGoR](https://anvilproject.org/data/consortia/GREGoR). GMKF and TCGA are hosted on other NCPI platforms and are accessible via the AnVIL because of interoperability (see https://www.ncpi-acc.org/ and https://ncpi-data.org/platforms); we list them as non-AnVIL hosted since, while accessible, they are not hosted on the AnVIL and would be inaccessible without NCPI. Finally, UDN is listed as non-AnVIL hosted because it is still in the data submission pipeline and not yet available.
+
+We'll join this anvil-hosted or not data with the actual data at the end.
+
+Given the input `subset_df`, we expect several answers to be comma separated. Since there are 12 set possible responses (not including "None") and one possible free response answer, we separate the `AccessWhichControlledData` column into 13 columns ("WhichA" through "WhichN"), splitting on ", " (a comma followed by a space; splitting on a bare comma produced duplicates that differed only by a leading space). Alternative approaches could [use `str_trim`](https://stringr.tidyverse.org/reference/str_trim.html) instead. We set `fill = "right"`, which shouldn't really matter; it just suppresses the warning about filling with `NA` when a respondent gave fewer than 13 responses. If there's only one response, it goes in `WhichA` and the remaining columns are filled with `NA`; if there are two, they go in `WhichA` and `WhichB` and the rest are `NA`, and so on.
+
+We then use `pivot_longer()` on the columns we just made, putting the column names in a new column `WhichChoice` and the values in a new column `whichControlledAccess`. We drop all the `NA`s in this new `whichControlledAccess` column (and there are a lot of them).
+
+Then we group by the new `whichControlledAccess` column and summarize a count of how many times each response appears.
+
+Then we use `mutate()` with `recode()` to simplify the fixed responses to just their acronyms, to remove the asterisks (which told the survey respondent that a dataset wasn't available because of policy restrictions), and to recode the free text responses (details below in "Notes on free text response recoding").
+
+We use a `left_join()` to join the cleaned data with the dataframe that specifies whether each dataset is currently available on the AnVIL. It's a left join rather than a full join so the annotation is only added for datasets that actually appear in the results.
+
+Finally, we return this subset and cleaned dataframe so that it can be plotted.
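+
+To make these reshaping steps concrete, here is a small optional sketch (marked `eval = FALSE` and using made-up responses rather than the survey data) of the separate / pivot / count / join pipeline that `prep_df_whichData()` implements:
+
+```{r, eval=FALSE}
+# Illustration only: toy responses showing the reshaping and annotation steps
+library(tidyverse)
+
+toy <- tibble(AccessWhichControlledData = c(
+  "The Genotype-Tissue Expression Project (GTEx), UK Biobank*",
+  "The Genotype-Tissue Expression Project (GTEx)"
+))
+
+toyAnnotation <- tibble(
+  whichControlledAccess = c("GTEx", "UK Biobank", "HPRC"),
+  onAnVIL = c("AnVIL hosted", "non-AnVIL hosted", "AnVIL hosted")
+)
+
+toy %>%
+  # One column per possible response; unused columns are filled with NA
+  separate(AccessWhichControlledData, c("WhichA", "WhichB"), sep = ", ", fill = "right") %>%
+  # Stack the response columns into one long column and drop the NAs
+  pivot_longer(starts_with("Which"),
+               names_to = "WhichChoice",
+               values_to = "whichControlledAccess") %>%
+  drop_na(whichControlledAccess) %>%
+  # Count how many respondents selected each dataset
+  group_by(whichControlledAccess) %>%
+  summarize(count = n()) %>%
+  # Simplify names and strip the asterisk, as in the real recode step
+  mutate(whichControlledAccess = recode(
+    whichControlledAccess,
+    "The Genotype-Tissue Expression Project (GTEx)" = "GTEx",
+    "UK Biobank*" = "UK Biobank"
+  )) %>%
+  # left_join only annotates datasets that appear in the responses,
+  # so HPRC (in the annotation but not the toy responses) adds no row
+  left_join(toyAnnotation, by = "whichControlledAccess")
+#> whichControlledAccess count onAnVIL
+#> GTEx                      2 AnVIL hosted
+#> UK Biobank                1 non-AnVIL hosted
+```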
+
+
+
+Notes on free text response recoding
+
+There were 4 "Other" free text responses:
+
+* "Being able to pull other dbGap data as needed."
+ --> We recoded this to be an "Other"
+* "GnomAD and ClinVar"
+ --> GnomAD and ClinVar are not controlled access datasets so we recoded that response to be "None"
+* "Cancer omics datasets"
+ --> We recoded this to be an "Other"
+* "TCGA"
+ --> This response was left as is since there is a controlled access tier.
+
+
+
+```{r}
+prep_df_whichData <- function(subset_df){
+
+ onAnVILDF <- data.frame(whichControlledAccess = c( #checking on this
+ "All of Us",
+ "UK Biobank",
+ "CCDG",
+ "CMG",
+ "CSER",
+ "eMERGE",
+ "GMKF",
+ "GREGoR",
+ "GTEx",
+ "HPRC",
+ "PAGE",
+ "UDN",
+ "Other",
+ "None",
+ "TCGA"
+ ),
+onAnVIL= c(
+ "non-AnVIL hosted",
+ "non-AnVIL hosted",
+ "AnVIL hosted",
+ "AnVIL hosted",
+ "AnVIL hosted",
+ "AnVIL hosted",
+ "non-AnVIL hosted",
+ "AnVIL hosted",
+ "AnVIL hosted",
+ "AnVIL hosted",
+ "AnVIL hosted",
+ "non-AnVIL hosted",
+ NA,
+ NA,
+ "non-AnVIL hosted"
+ )
+)
+
+ subset_df %<>% separate(AccessWhichControlledData, c("WhichA", "WhichB", "WhichC", "WhichD", "WhichE", "WhichF", "WhichG", "WhichH", "WhichI", "WhichJ", "WhichK", "WhichM", "WhichN"), sep = ", ", fill="right") %>%
+ pivot_longer(starts_with("Which"), names_to = "WhichChoice", values_to = "whichControlledAccess") %>%
+ drop_na(whichControlledAccess) %>%
+ group_by(whichControlledAccess) %>% summarize(count = n()) %>%
+ mutate(whichControlledAccess =
+ recode(whichControlledAccess,
+ "All of Us*" = "All of Us",
+ "UK Biobank*" = "UK Biobank",
+ "Centers for Common Disease Genomics (CCDG)" = "CCDG",
+ "The Centers for Mendelian Genomics (CMG)" = "CMG",
+ "Clinical Sequencing Evidence-Generating Research (CSER)" = "CSER",
+ "Electronic Medical Records and Genomics (eMERGE)" = "eMERGE",
+ "Gabriella Miller Kids First (GMKF)" = "GMKF",
+ "Genomics Research to Elucidate the Genetics of Rare Diseases (GREGoR)" = "GREGoR",
+ "The Genotype-Tissue Expression Project (GTEx)" = "GTEx",
+ "The Human Pangenome Reference Consortium (HPRC)" = "HPRC",
+ "Population Architecture Using Genomics and Epidemiology (PAGE)" = "PAGE",
+ "Undiagnosed Disease Network (UDN)" = "UDN",
+ "Being able to pull other dbGap data as needed." = "Other",
+ "Cancer omics datasets" = "Other",
+ "GnomAD and ClinVar" = "None", #not controlled access
+ )
+ ) %>% left_join(onAnVILDF, by="whichControlledAccess")
+
+ return(subset_df)
+}
+```
+
+Description of variable definitions and steps
+
+Here we set up 4 data frames for plotting
+
+* The first uses all of the responses and sends them through the `prep_df_whichData()` function to clean the data for plotting to see which controlled access datasets are the most popular.
+* The second filters to grab just the responses from those experienced in clinical research using the `clinicalFlag` column (described earlier in the Clean Data -> Simplified experience status for various research categories (clinical, human genomics, non-human genomics) subsection)
+* The third filters to grab just the responses from those experienced in human genomic research using the `humanGenomicFlag` column (described earlier in the Clean Data -> Simplified experience status for various research categories (clinical, human genomics, non-human genomics) subsection)
+* The fourth filters to grab just the responses from those experienced in non-human genomic research using the `nonHumanGenomicFlag` column (described earlier in the Clean Data -> Simplified experience status for various research categories (clinical, human genomics, non-human genomics) subsection)
+
+
+
+```{r}
+whichDataDf <- resultsTidy %>% prep_df_whichData()
+
+whichDataClinicalSubset <- resultsTidy %>%
+ filter(clinicalFlag == TRUE) %>%
+ prep_df_whichData()
+
+whichDataHumanGenomicSubset <- resultsTidy %>%
+ filter(humanGenomicFlag == TRUE) %>%
+ prep_df_whichData()
+
+whichDataNonHumanGenomicSubset <- resultsTidy %>%
+ filter(nonHumanGenomicFlag == TRUE) %>%
+ prep_df_whichData()
+
+```
+
+### Plot the data
+
+Description of variable definitions and steps
+
+We also use a function here because the plotting steps are the same for each subset; only the subtitle and the input dataframe change.
+
+The function takes the input dataframe and makes a bar plot: the controlled access datasets are listed on the x-axis (reordered by count so the most popular is on the left), the number of times each dataset was requested (its popularity) is on the y-axis, and the fill indicates whether the dataset is available on the AnVIL or not.
+
+We adjust theme elements (removing the panel border, panel background, and panel grid), rotate the x-axis tick labels, add x- and y-axis labels, and add a title (and a subtitle if specified, which it will be when we're looking at just a subset such as respondents experienced with clinical data).
+
+We also add text labels above the bars reporting how many times each dataset was requested. Note that we have to use the `after_stat()` / `stat = 'summary'` / `fun = sum` approach again because of the recoding: for the labels to be accurate, they have to combine the counts for every response we recoded to the same value, since the `group_by()` and `summarize()` counting happened before the recode. We use `coord_cartesian(clip = "off")` so these bar labels aren't cut off, and finally the function returns the plot.
+
+We call this function 4 times
+
+* once for all the data (and don't use a subtitle)
+* next for just those experienced with clinical data (using a subtitle to specify this)
+* next for just those experienced with human genomic data (using a subtitle to specify this)
+* and finally for just those experienced with non-human genomic data (using a subtitle to specify this)
+
+
+
+```{r}
+
+plot_which_data <- function(inputToPlotDF, subtitle = NULL){
+
+ toreturnplot <- ggplot(inputToPlotDF, aes(x = reorder(whichControlledAccess, -count), y = count, fill = onAnVIL)) +
+ geom_bar(stat="identity") +
+ theme_classic() + theme(panel.background = element_blank(), panel.grid = element_blank()) +
+ theme(axis.text.x = element_text(angle=45, hjust=1)) +
+ xlab("Controlled access datasets") + ylab("Count") +
+ ggtitle("What large, controlled access datasets do you access\nor would you be interested in accessing using the AnVIL?", subtitle = subtitle) +
+ geom_text(aes(label = after_stat(y), group = whichControlledAccess),
+ stat = 'summary', fun = sum, vjust = -1, size=2) +
+ coord_cartesian(clip = "off") +
+ scale_fill_manual(values = c("#25445A", "#7EBAC0", "grey"))
+
+return(toreturnplot)
+
+}
+
+```
+
+```{r}
+everyoneDataPlot <- plot_which_data(whichDataDf)
+
+everyoneDataPlot
+
+ggsave(here("plots/whichcontrolleddata.png"), plot = everyoneDataPlot)
+```
+
+```{r}
+clinicalDataPlot <- plot_which_data(whichDataClinicalSubset, subtitle = "Respondents moderately or extremely experienced with clinical data")
+
+clinicalDataPlot
+
+ggsave(here("plots/whichcontrolleddata_clinical.png"), plot = clinicalDataPlot)
+```
+
+```{r}
+humanGenomicDataPlot <- plot_which_data(whichDataHumanGenomicSubset, subtitle = "Respondents moderately or extremely experienced with human genomic data")
+
+humanGenomicDataPlot
+
+ggsave(here("plots/whichcontrolleddata_humangenomic.png"), plot = humanGenomicDataPlot)
+```
+
+```{r}
+nonHumanGenomicDataPlot <- plot_which_data(whichDataNonHumanGenomicSubset, subtitle = "Respondents moderately or extremely experienced with non-human genomic data")
+
+nonHumanGenomicDataPlot
+
+ggsave(here("plots/whichcontrolleddata_nonhumangenomic.png"), plot = nonHumanGenomicDataPlot)
+```
+
+## Types of Data respondents would want to analyze on the AnVIL
+
+Question and possible answers
+
+>What types of data do you or would you analyze using the AnVIL?
+
+Possible answers include
+
+* Genomes/exomes
+* Transcriptomes
+* Metagenomes
+* Proteomes
+* Metabolomes
+* Epigenomes
+* Structural
+* Single Cell
+* Imaging
+* Phenotypic
+* Electronic Health Record
+* Metadata
+* Survey
+* Other (with free text response)
+
+
+
+Description of variable definitions and steps
+
+Because the responses to this question look very similar to those for the controlled access dataset question above, we follow the same approach here to prepare and plot the data.
+
+
+
+### Prepare the data
+
+```{r}
+prep_df_typeData <- function(subset_df){
+ subset_df %<>% separate(TypesOfData, c("WhichA", "WhichB", "WhichC", "WhichD", "WhichE", "WhichF", "WhichG", "WhichH", "WhichI", "WhichJ", "WhichK", "WhichM", "WhichN", "WhichO"), sep = ", ", fill="right") %>%
+ pivot_longer(starts_with("Which"), names_to = "WhichChoice", values_to = "whichTypeData") %>%
+ drop_na(whichTypeData) %>%
+ group_by(whichTypeData) %>% summarize(count = n()) %>%
+ mutate(whichTypeData =
+ recode(whichTypeData,
+ "I don't analyze data on AnVIL" = NA_character_,
+ "I store data in AnVIL. I don’t analyze it." = NA_character_,
+ "Used in training for analysis of genomes (variant calling)" = "Variant Calling"
+ )
+ ) %>%
+ drop_na(whichTypeData)
+ return(subset_df)
+}
+```
+
+```{r}
+typeOfDataDf <- resultsTidy %>% prep_df_typeData()
+
+typeDataClinicalSubset <- resultsTidy %>%
+ filter(clinicalFlag == TRUE) %>%
+ prep_df_typeData()
+
+typeDataHumanGenomicSubset <- resultsTidy %>%
+ filter(humanGenomicFlag == TRUE) %>%
+ prep_df_typeData()
+```
+
+### Plot the data
+
+```{r}
+
+plot_type_data <- function(inputToPlotDF, subtitle = NULL){
+ toreturnplot <- ggplot(inputToPlotDF, aes(x = reorder(whichTypeData, -count), y = count)) +
+ geom_bar(stat="identity") +
+ theme_classic() + theme(panel.background = element_blank(), panel.grid = element_blank()) +
+ theme(axis.text.x = element_text(angle=45, hjust=1)) +
+ xlab("Types of data") + ylab("Count") +
+ ggtitle("What types of data do you or would you analyze using the AnVIL?", subtitle = subtitle) +
+ geom_text(aes(label = after_stat(y), group = whichTypeData),
+ stat = 'summary', fun = sum, vjust = -1, size=2) +
+ coord_cartesian(clip = "off")
+ return(toreturnplot)
+}
+```
+
+```{r}
+everyone_type_data <- plot_type_data(typeOfDataDf)
+
+everyone_type_data
+
+ggsave(here("plots/typesOfData.png"), plot=everyone_type_data)
+```
+
+```{r}
+clinical_type_data <- plot_type_data(typeDataClinicalSubset, subtitle = "Respondents moderately or extremely experienced with clinical data")
+
+clinical_type_data
+
+ggsave(here("plots/typesOfData_clinical.png"), plot=clinical_type_data)
+```
+
+```{r}
+humangenomic_type_data <- plot_type_data(typeDataHumanGenomicSubset, subtitle = "Respondents moderately or extremely experienced with human genomic data")
+
+humangenomic_type_data
+
+ggsave(here("plots/typesOfData_humangenomic.png"), plot=humangenomic_type_data)
+```
+
+## Source of funds for cloud computing
+
+> What source(s) of funds do you use to pay for cloud computing?
+
+Possible answers include
+
+* NHGRI
+* Other NIH
+* Foundation Grant
+* Institutional funds
+* Don't know
+* Only use free options
+* Other (with free text entry if Other is selected)
+
+The only "Other" free text response in this set is NSF.
+
+Answers are stored in the `FundingSources` column. This was a select-all-that-apply question asked of all survey takers, so answers may be comma separated.
+
+### Prepare the data
+
+Description of variable definitions and steps
+
+As with the earlier select-all-that-apply questions, we separate the comma-separated `FundingSources` column into one column per possible response, pivot longer, drop the `NA`s, and count responses grouped by funding source and user type.
+
+```{r}
+toPlotFundingSource <- resultsTidy %>% separate(FundingSources, c("WhichA", "WhichB", "WhichC", "WhichD", "WhichE", "WhichF", "WhichG"), sep = ", ", fill="right") %>%
+ pivot_longer(starts_with("Which"), names_to = "WhichChoice", values_to = "whichFundingSource") %>%
+ drop_na(whichFundingSource) %>%
+ group_by(whichFundingSource, UserType) %>% summarize(count = n())
+```
+
+### Plot the data
+
+Description of variable definitions and steps
+
+We first plot stacked bars of the counts for each funding source, colored by user type, and then a second version showing the fraction of responses within each user type, colored by funding source.
+
+```{r}
+
+toPlotFundingSource %>% ggplot(aes(y = reorder(whichFundingSource,count), x = count, fill = UserType)) +
+ geom_bar(position = "stack", stat = "identity") +
+ scale_fill_manual(values = c("#E0DD10", "#035C94")) +
+ theme_bw() +
+ ggtitle("What source(s) of funds do you use to pay for cloud computing?") +
+ xlab("Count") +
+ ylab("Funding Source") +
+ theme(panel.background = element_blank(),
+ panel.grid.minor.x = element_blank(),
+ panel.grid.minor.y = element_blank(),
+ panel.grid.major.y = element_blank())
+
+ggsave(here("plots/fundingsources.png"))
+```
+
+```{r}
+toPlotFundingSource %>%
+ mutate(UserType = case_when(
+ UserType == "CurrentUser" ~ "Current",
+ UserType == "PotentialUser" ~ "Potential"
+ ),
+ whichFundingSource = factor(whichFundingSource, levels = rev(c("NHGRI", "Other NIH", "Institutional funds", "Foundation Grant", "NSF", "Only use free options", "Don't know")))
+ ) %>%
+ ggplot(aes(y = UserType, x = count, fill = whichFundingSource)) +
+ geom_bar(position = "fill", stat = "identity") +
+ scale_fill_manual(values = rev(c("#035C94", "#012840", "#F2F2F2", "#E0DD10", "#AEEBF2", "#7EBAC0", "#333333"))) +
+ theme_bw() +
+ ggtitle("What source(s) of funds do you use to pay for cloud computing?") +
+ xlab("Fraction of responses") +
+ ylab("User Type") +
+ theme(panel.background = element_blank(),
+ panel.grid.minor.x = element_blank(),
+ panel.grid.minor.y = element_blank(),
+ panel.grid.major.y = element_blank()) +
+ labs(fill="Funding Source")
+
+
+ggsave(here("plots/fundingsources_colorSource.png"))
+```
+
+## Preference for training modality
+
+>Please rank how/where you would prefer to attend AnVIL training workshops.
+
+Possible answers include
+
+* On-site at my institution: `AnVILTrainingWorkshopsOnSite`
+* Virtual: `AnVILTrainingWorkshopsVirtual`
+* Conference (e.g., CSHL, AMIA): `AnVILTrainingWorkshopsConference`
+* AnVIL-specific event: `AnVILTrainingWorkshopsSpecEvent`
+* Other: `AnVILTrainingWorkshopsOther`
+
+The responses are stored in the columns whose names start with `AnVILTrainingWorkshops`.
+
+
+### Prepare the data
+
+```{r}
+
+resultsTidy %<>%
+  # Ensure the ranking columns are plain character vectors,
+  # unnesting list columns and keeping empty (unranked) responses
+  mutate(across(starts_with(
+    "AnVILTrainingWorkshops"), as.character)) %>%
+  unnest(starts_with("AnVILTrainingWorkshops"), keep_empty = TRUE) %>%
+  # Strip the text from the labeled endpoints of the ranking scale and
+  # treat "NULL" strings as missing
+  mutate(across(
+    starts_with("AnVILTrainingWorkshops"),
+    ~ recode(
+      .x,
+      "1 (Most preferred in this list)" = "1",
+      "5 (Least preferred in this list)" = "5",
+      "NULL" = NA_character_
+    )
+  )) %>%
+  # Convert the cleaned ranks to integers
+  mutate(across(starts_with("AnVILTrainingWorkshop"), as.integer))
+
+```
+
+```{r}
+toPlotTrainingRanks <- bind_rows(
+ resultsTidy %>%
+ filter(UserType == "CurrentUser") %>%
+ select(starts_with("AnVILTrainingWorkshops")) %>%
+ colSums(na.rm = TRUE) %>%
+ as.data.frame() %>% `colnames<-`(c("totalRank")) %>%
+ mutate(nranks = sum(resultsTidy$UserType == "CurrentUser"),
+ avgRank = totalRank / nranks,
+ UserType = "Current Users") %>%
+ mutate(TrainingType = rownames(.)) %>%
+ mutate(TrainingType = str_replace(TrainingType, "AnVILTrainingWorkshops", "")),
+ resultsTidy %>%
+ filter(UserType == "PotentialUser") %>%
+ select(starts_with("AnVILTrainingWorkshops")) %>%
+ colSums() %>%
+ as.data.frame() %>% `colnames<-`(c("totalRank")) %>%
+ mutate(nranks = sum(resultsTidy$UserType == "PotentialUser"),
+ avgRank = totalRank / nranks,
+ UserType = "Potential Users") %>%
+ mutate(TrainingType = rownames(.)) %>%
+ mutate(TrainingType = str_replace(TrainingType, "AnVILTrainingWorkshops", ""))
+ ) %>% mutate(TrainingType = recode(TrainingType, "SpecEvent" = "AnVIL-specific event", "OnSite" = "On-site at my institution", "Conference" = "Conference (e.g., CSHL, AMIA)")) %>%
+ mutate(UserType = factor(UserType, levels = c("Potential Users", "Current Users")))
+
+```
+
+### Plot the data
+
+```{r}
+tdumbbell <- ggplot(toPlotTrainingRanks, aes(x = avgRank, y = reorder(TrainingType, -avgRank))) +
+ geom_line() +
+ geom_point(aes(color = UserType), size = 3) +
+ theme(panel.background = element_blank()) + theme_bw() + theme(legend.position = "bottom") +
+ xlab("Average Rank") +
+ ylab("Training Workshop Modality") +
+ ggtitle("Please rank how/where you would prefer to attend\nAnVIL training workshops.") +
+ scale_color_manual(values = c("#E0DD10", "#035C94")) +
+ theme(legend.title=element_blank())
+
+tdumbbell
+
+ggsave(here("plots/dumbbellplot_trainingmodalitypref.png"), plot = tdumbbell)
+
+tdumbbell <- tdumbbell +
+ scale_x_continuous(breaks = 5:1, labels = 5:1, limits = c(1,5))+
+ annotation_custom(textGrob("Most\npreferred", gp=gpar(fontsize=8, fontface = "bold")),xmin=1,xmax=1,ymin=-0.5,ymax=-0.5) +
+ annotation_custom(textGrob("Least\npreferred", gp=gpar(fontsize=8, fontface= "bold")),xmin=5,xmax=5,ymin=-0.5,ymax=-0.5) +
+ coord_cartesian(clip = "off") +
+ theme(plot.margin = margin(1,1,1,1.1, "cm"))
+
+
+tdumbbell
+
+ggsave(here("plots/dumbbellplot_xlim15_trainingmodalitypref.png"), plot = tdumbbell)
+
+tdumbbell <- tdumbbell +
+ scale_x_reverse(limits = c(5,1)) +
+ annotation_custom(textGrob("Most\npreferred", gp=gpar(fontsize=8, fontface = "bold")),xmin=-1,xmax=-1,ymin=-0.5,ymax=-0.5) +
+ annotation_custom(textGrob("Least\npreferred", gp=gpar(fontsize=8, fontface= "bold")),xmin=-5,xmax=-5,ymin=-0.5,ymax=-0.5)
+
+tdumbbell
+
+ggsave(here("plots/dumbbellplot_xlim15_revaxis_trainingmodalitypref.png"), plot = tdumbbell)
+
+```
+
+## Session Info
+
+```{r}
+sessionInfo()
+```
+
diff --git a/style.Rmd b/style.Rmd
deleted file mode 100644
index 39bb2f8..0000000
--- a/style.Rmd
+++ /dev/null
@@ -1,52 +0,0 @@
----
-title: "Style"
-output: html_document
----
-
-## Navigation bar
-
-To change the part of the navigation bar that says "OTTR Web", modify the title within the `_site.yml` file.
-
-
-```{r, fig.align='center', fig.alt= "Change nav bar", echo = FALSE, out.width="40%"}
-knitr::include_graphics("resources/images/navbar.png")
-```
-
-
-## Overall theme
-
-To change the color scheme/fonts of the website modify the `theme` in the `_site.yml` file (see [here](https://bookdown.org/yihui/blogdown/themes.html) for options):
-
-```{r, fig.align='center', fig.alt= "Change theme", echo = FALSE, out.width="40%"}
-knitr::include_graphics("resources/images/theme.png")
-```
-
-## Change the favicon
-
-The small image that shows up on the browser can also be changed.
-
-You can make a small image to replace the existing one by going to https://favicon.io/favicon-converter/ and uploading an image that you would like.
-
-Next, simply replace the image called `favicon.ico` in the `images` directory within the `resources` directory with the image you just created and downloaded from the favicon converter website.
-
-## Additional changes
-
-To make additional changes to the style, you can modify the `styles.css` file with css code. This [website](https://www.w3schools.com/css/) has great information about css code.
-
-As an example if you wanted to change the color of the blue line to green you could change where it says `lightblue` to `lightgreen` in the `styles.css` file. You can also use a hex color code like those that can be found at this [website](https://htmlcolorcodes.com/), such as `#00FF9E` to get a specific shade.
-
-
-```{r, fig.align='center', fig.alt= "Change color of line", echo = FALSE, out.width="80%"}
-knitr::include_graphics("resources/images/blue.png")
-```
-
-Note that if you change the css file with a new element that is not already defined like `body` then you would need to do it as done with the `banner` element. This was then added to the index.Rmd file by using:
-
-```
-
-Banner text!
-
-```
-
-Also checkout the [R Markdown cheatsheet](https://www.rstudio.com/wp-content/uploads/2015/02/rmarkdown-cheatsheet.pdf) for more customization of the pages.
-