diff --git a/.github/switch_sync_repo.R b/.github/switch_sync_repo.R deleted file mode 100644 index 7975abf..0000000 --- a/.github/switch_sync_repo.R +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env Rscript - -# This script switches the repo entry for the yaml file to whatever is specified -# Written by Candace Savonen Jan 2022 - -if (!("optparse" %in% installed.packages())){ - install.packages("optparse") -} - -library(optparse) - -option_list <- list( - optparse::make_option( - c("--repo"), - type = "character", - default = "jhudsl/OTTR_Template_Test", - help = "GitHub repository name, e.g. jhudsl/OTTR_Template_Test", - ) -) - -# Read the arguments passed -opt_parser <- optparse::OptionParser(option_list = option_list) -opt <- optparse::parse_args(opt_parser) - -# Find .git root directory -root_dir <- rprojroot::find_root(rprojroot::has_dir(".git")) - -# Get test sync yaml path -sync_file_path <- file.path(root_dir, ".github", "test-sync.yml") - -yaml_contents <- yaml::yaml.load_file(sync_file_path) - -# Only keep first grouping -yaml_contents$group <- yaml_contents$group[[1]] - -# Switch out repo -yaml_contents$group$repos <- opt$repo - -yaml::write_yaml(yaml_contents, sync_file_path) diff --git a/.github/sync.yml b/.github/sync.yml deleted file mode 100644 index 27510d2..0000000 --- a/.github/sync.yml +++ /dev/null @@ -1,49 +0,0 @@ -# Candace Savonen Aug 2021 -# For info on how to update this file see: https://github.com/marketplace/actions/repo-file-sync-action#%EF%B8%8F-sync-configuration - -group: - - files: - - source: .github/workflows/ - dest: .github/workflows/ - deleteOrphaned: true - exclude: | - send-updates.yml - test-send-updates.yml - starting-course.yml - release-notes.yml - - source: .github/workflows/delete-preview.yml - dest: .github/workflows/delete-preview.yml - - source: .github/workflows/render-site.yml - dest: .github/workflows/render-site.yml - - source: .github/workflows/pull_request.yml - dest: .github/workflows/pull_request.yml - - source: config_automation.yml - dest: config_automation.yml - # Repositories to receive changes - repos: | - jhudsl/ottrproject.org - jhudsl/Baltimore_Community_Course - maculatus/test-ottr-website - dr-sayyadhury/OTTR_Template_Website_repo - whalera1901/Current-projects - GenetcXBiotech1/Dr.Fierst_lab - buriedsand/glbio-personal-website - PurplFeesh/test-ottr-site - jcha40/ottr_test_site - jhudsl/ITN_computing_resources - fhdsl/ITN_course_search - cansavvy/cansavvy_website - fhdsl/metricminer-dashboard - fhdsl/Intermediate_R -###ADD NEW REPO HERE following the format above# - -### These are custom groups for syncing -- not all files needs to be synced # will update later - # - files: - # - source: config_automation.yml - # dest: config_automation.yml - # - source: .github/workflows/pull-request.yml - # dest: .github/workflows/pull-request.yml - # - source: scripts/spell-check.R - # dest: scripts/spell-check.R - # repos: | - # jhudsl/Baltimore_Community_Course diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 0cf82f4..aa5c9e0 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -95,14 +95,27 @@ jobs: git merge -s recursive --strategy-option=theirs origin/${{ github.head_ref }} --allow-unrelated-histories shell: bash + # Set up / install jq so that json credentials can be read in + - name: Install jq + uses: dcarbone/install-jq-action@v2.1.0 + # We want a fresh run of the renders each time - so first delete old html files - name: Delete old *.html - run: 
Rscript -e "rmarkdown::clean_site(preview = FALSE)" + run: Rscript -e "rmarkdown::clean_site(input = 'pages', preview = FALSE)" # Now we want to render all the html files from the Rmd files - name: Run render html id: site - run: Rscript -e "rmarkdown::render_site()" + run: | + if [ ! -d .secrets ]; then + mkdir .secrets + fi + jq -n '${{ secrets.GS_SA_KEY }}' > .secrets/${{ secrets.GS_SA_FILENAME }} + if test -f .secrets/${{ secrets.GS_SA_FILENAME }}; then + echo "Key created!" + fi + Rscript --vanilla "resources/render.R" + rm -rf .secrets # This checks on the steps before it and makes sure that they completed. # If the renders didn't complete we don't want to commit the file changes @@ -124,7 +137,7 @@ jobs: echo "changes=$changes" >> $GITHUB_OUTPUT git add . --force git commit -m 'Render preview' || echo "No changes to commit" - git pull --set-upstream origin $branch_name --allow-unrelated-histories --strategy-option=ours + git pull --rebase --set-upstream origin $branch_name --allow-unrelated-histories --strategy-option=ours git push --force || echo "No changes to commit" shell: bash diff --git a/.github/workflows/render-site.yml b/.github/workflows/render-site.yml index 7f74ea2..023582b 100644 --- a/.github/workflows/render-site.yml +++ b/.github/workflows/render-site.yml @@ -52,14 +52,27 @@ jobs: # use github PAT token: ${{ secrets.GH_PAT }} + # Set up / install jq so that json credentials can be read in + - name: Install jq + uses: dcarbone/install-jq-action@v2.1.0 + # We want a fresh run of the renders each time - so first delete old html files - name: Delete old *.html - run: Rscript -e "rmarkdown::clean_site(preview = FALSE)" + run: Rscript -e "rmarkdown::clean_site(input = 'pages', preview = FALSE)" # Now we want to render all the html files from the Rmd files - name: Run render html id: site - run: Rscript -e "rmarkdown::render_site()" + run: | + if [ ! -d .secrets ]; then + mkdir .secrets + fi + jq -n '${{ secrets.GS_SA_KEY }}' > .secrets/${{ secrets.GS_SA_FILENAME }} + if test -f .secrets/${{ secrets.GS_SA_FILENAME }}; then + echo "Key created!" + fi + Rscript --vanilla "resources/render.R" + rm -rf .secrets # This checks on the steps before it and makes sure that they completed. # If the renders didn't complete we don't want to commit the file changes @@ -77,4 +90,6 @@ jobs: git config --global user.email 'github-actions[bot]@users.noreply.github.com' git add --force docs/* git commit -m 'Render site' || echo "No changes to commit" + git reset --hard HEAD + git pull --rebase --allow-unrelated-histories --strategy-option=ours git push origin main || echo "No changes to push" diff --git a/.github/workflows/send-updates.yml b/.github/workflows/send-updates.yml deleted file mode 100644 index d41b15b..0000000 --- a/.github/workflows/send-updates.yml +++ /dev/null @@ -1,45 +0,0 @@ -# Candace Savonen Aug 2021 - -name: Sync Files - -on: - release: - types: - - published - workflow_dispatch: - inputs: - prtag: - description: 'Tag to use?' 
- required: true - default: 'null' - -jobs: - sync: - runs-on: ubuntu-latest - steps: - - name: Checkout Repository - uses: actions/checkout@master - - - name: Login as jhudsl-robot - run: | - git config --global --add safe.directory $GITHUB_WORKSPACE - git config --global user.email "itcrtrainingnetwork@gmail.com" - git config --global user.name "jhudsl-robot" - - - name: Get the version - id: get_tag - run: | - if [ github.event.inputs.prtag == 'null' ] - then - echo "version=$(echo $GITHUB_REF | cut -d / -f 3)" >> $GITHUB_OUTPUT - fi - if [ github.event.inputs.prtag != 'null' ] - then - echo "version=${{ github.event.inputs.prtag }}" >> $GITHUB_OUTPUT - fi - - - name: Run Mechanics File Sync - uses: BetaHuhn/repo-file-sync-action@v1.17.21 - with: - GH_PAT: ${{ secrets.GH_PAT }} - COMMIT_BODY: release-${{ steps.get_tag.outputs.version }} diff --git a/.github/workflows/test-send-updates.yml b/.github/workflows/test-send-updates.yml deleted file mode 100644 index 8c688eb..0000000 --- a/.github/workflows/test-send-updates.yml +++ /dev/null @@ -1,40 +0,0 @@ -# Candace Savonen Aug 2021 - -name: Test Sync Files - -on: - workflow_dispatch: - inputs: - repo: - description: 'What repo to test on e.g. jhudsl/OTTR_Template_Test' - required: true - default: 'jhudsl/OTTR_Template_Test' - -jobs: - test-sync: - runs-on: ubuntu-latest - container: - image: jhudsl/base_ottr:main - - steps: - - name: Checkout Repository - uses: actions/checkout@master - - - name: Login as jhudsl-robot - run: | - git config --system --add safe.directory "$GITHUB_WORKSPACE" - git config --local user.email "itcrtrainingnetwork@gmail.com" - git config --local user.name "jhudsl-robot" - - - name: Set up test-sync.yml - run: | - cp .github/sync.yml .github/test-sync.yml - # Switch out repo argument - Rscript --vanilla .github/switch_sync_repo.R --repo ${{ github.event.inputs.repo }} - - - name: Run Mechanics File Sync - uses: BetaHuhn/repo-file-sync-action@v1.17.21 - with: - GH_PAT: ${{ secrets.GH_PAT }} - COMMIT_BODY: "test-run" - CONFIG_PATH: .github/test-sync.yml diff --git a/.gitignore b/.gitignore index 5b6a065..bc47b05 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,6 @@ .Rhistory .RData .Ruserdata +.secrets/* +.DS_Store +*/.DS_Store \ No newline at end of file diff --git a/IdentifyTypeOfUsers.Rmd b/IdentifyTypeOfUsers.Rmd new file mode 100644 index 0000000..8fb66ba --- /dev/null +++ b/IdentifyTypeOfUsers.Rmd @@ -0,0 +1,52 @@ +--- +title: "Identify current vs potential users" +author: "" +date: "" +output: html_document +--- + +```{r, message = FALSE, echo = FALSE,results='hide', warning=FALSE} +library(tidyverse) +library(here) + +knitr::knit_child("TidyData.Rmd") #inherit resultsTidy +``` + +```{r, message=FALSE, echo=FALSE} +resultsTidy %>% + group_by(UserType, CurrentUsageDescription) %>% + summarize(count = n()) %>% + mutate(CurrentUsageDescription = case_when( + CurrentUsageDescription == "For ongoing projects (e.g., consistent project development and/or work)" ~ "For ongoing projects:\nconsistent project development\nand/or work", + CurrentUsageDescription == "For completed/long-term projects (e.g., occasional updates/maintenance as needed)" ~ "For completed/long-term projects:\noccasional updates/maintenance\nas needed", + CurrentUsageDescription == "For short-term projects (e.g., short, intense bursts separated by a few months)" ~ "For short-term projects:\nshort, intense bursts\nseparated by a few months", + CurrentUsageDescription == "I do not currently use the AnVIL, but have in the past" ~ "I do not 
current use the AnVIL,\nbut have in the past", + CurrentUsageDescription == "I have never used the AnVIL, but have heard of it" ~ "I have never\nused the AnVIL", + CurrentUsageDescription == "I have never heard of the AnVIL" ~ "I have never\nheard of the AnVIL" + )) %>% + ggplot(aes(x = count, y = reorder(CurrentUsageDescription, count), fill = UserType)) + + scale_fill_manual(values = c("#E0DD10", "#035C94")) + + geom_bar(stat="identity", position ="stack") + + theme_classic() + + xlab("Count") + + ylab("") + + ggtitle("How would you describe your current usage\nof the AnVIL platform?") + + geom_text(aes(label = count, group = CurrentUsageDescription), + hjust = -0.5, size=2) + + theme(legend.title = element_blank()) +``` + +## Takeaway + +Of the ```r nrow(resultsTidy)``` responses, ```r nrow(resultsTidy %>% filter(UserType == "Current User"))``` were current users and ```r nrow(resultsTidy %>% filter(UserType == "Potential User"))``` were potential users. The majority of current users belonged to the group who use the AnVIL for ongoing projects while the majority of potential users were evenly split between those who have never used the AnVIL (but have heard of it) and those who used to previously use the AnVIL, but don't currently. + +## Potential Follow-ups: + +- Look to see if those potential users who previously used to use the AnVIL show similarity in overall trends with the rest of the potential users +- Directly ask why they no longer use the AnVIL + +
Description of variable definitions and steps in preparing and plotting the data + +First, we group the data by the assigned `UserType` labels/categories and their related more detailed descriptions. Then we use `summarize` to count the occurrences for each of those categories. We use a mutate statement to better fit the detailed descriptions on the plot. We then send this data to ggplot with the count on the x-axis, and the usage descriptions on the y-axis (ordered by count so highest count is on the top). We fill with the user type description we've assigned. We manually scale the fill to be AnVIL colors and specify we want this to be a stacked bar chart. We then make edits for the theme and labels and finally add a geom_text label for the count next to the bars before we save the plot. + +
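
The description above mentions saving the plot, but the chunk itself stops at the `theme()` call. A minimal sketch of that save step, assuming a `plots/` folder and a hypothetical file name (the actual path and dimensions used for the repository's PNGs may differ):

```{r, eval = FALSE}
# Hypothetical save step implied by the description above; file name and size are assumptions
ggsave(
  here::here("plots", "currentUsageDescription_userType.png"),
  width = 7, height = 5, dpi = 300
)
```
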
diff --git a/README.md b/README.md index 223e26c..6eedf80 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,19 @@ -# OTTR for Websites! +# Analysis of the State of the AnVIL 2024 Poll +Analysis of the State of the AnVIL 2024 Poll -Get started by going to [ottrproject.org](https://www.ottrproject.org/getting_started.html)! +## Demographics -This is a template for creating websites from Rmd files hosted on GitHub with three helpful automations following a pull request to the repository: spelling check, broken link check, and website rendering. +### Institutional affiliation -- Check for spelling errors more intensively than RStudio and allow you to add words to the dictionary -- Check for broken links - you will be warned about broken links -- Automatic rendering of the website for previewing before merges -- Automatic rendering of the website upon merging to main -- Docker images that can be customized. +![Institutional affiliation](https://github.com/fhdsl/anvilPoll2024/blob/3c6a05514355bce61033481470f940645928b83e/plots/institutionalType_faceteduserType.png) +### Highest Degree attained +![Highest Degree attained](plots/degree_usertype.png) + +### Genomics and Clinical Research Experience +![Research Experience](https://github.com/fhdsl/anvilPoll2024/blob/9178a9e2ca527eab98b8caeb3b31346e916113ab/plots/researchExperienceLevel_colorExperienceLevel_noUserTypeSplit.png) + +## Insights + +### Feature importance for current vs potential users +![Feature importance dumbbell plot](plots/dumbbellplot_xlim16_rankfeatures.png) diff --git a/TidyData.Rmd b/TidyData.Rmd new file mode 100644 index 0000000..2d37a14 --- /dev/null +++ b/TidyData.Rmd @@ -0,0 +1,610 @@ +--- +title: "Tidy Data" +author: "Kate Isaac, Elizabeth Humphries, & Ava Hoffman" +date: "`r Sys.Date()`" +output: html_document +--- + +```{r, message=FALSE} +library(googlesheets4) +library(tidyverse) +library(magrittr) #for %<>% +library(here) +library(grid) #for Grobs and unit() +library(scales) #pretty breaks +``` + +# Read in data + +Data were read in via a Google Sheet on the AnVIL Team Drive. + +
Import details +The google sheet we are reading in is stored in an AnVIL Google drive folder `State of the AnVIL 2024`. Its permissions are restricted such that only people with access can open with the link. Using `gs4_auth()` to authorize my google account before running this code, I needed to change the `scopes` argument, specifically `scopes=spreadsheets.readonly` was necessary. + +In this google sheet, each question is a column, and each response to the survey is a row. If the respondent wasn't asked or didn't answer a specific question, there is an NA in the corresponding row/column. + +```{r, eval=FALSE, echo=FALSE} +gs4_auth(email = TRUE) +``` + +```{r, echo=FALSE, message=FALSE} +resultsRaw <- + googlesheets4::read_sheet( + "https://docs.google.com/spreadsheets/d/1wDMNC6BD2AaIwh_GOkPTpl1tvAyLwVBQgAvOD2rYrX0/edit?usp=sharing", + na = c("NA", "na", "")) +``` + +
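
The note above says the `scopes` argument had to be changed for the authorization to work. A sketch of what that call can look like, with the scope spelled out in full (the short form `spreadsheets.readonly` used elsewhere in this repository should resolve to the same scope in recent googlesheets4/gargle versions):

```{r, eval = FALSE}
# Authorize with read-only access to Google Sheets before calling read_sheet()
googlesheets4::gs4_auth(
  email = TRUE,  # reuse a cached token for the account already authorized on this machine
  scopes = "https://www.googleapis.com/auth/spreadsheets.readonly"
)
```
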
+ +# Clean data + +**Note:** Every code block in this section edits the `resultsTidy` data frame and should be run before plotting within the `# Insights` section below. Subsections are marked according to which Insight they are related to, but cleaning steps like identifying the user type are important for most every plot. + +## Set Column Names + +We set the column names to simplified column names (e.g., that help us select related columns for various analyses) by reading in a codebook (`data/codebook.txt`). + +
Simplifying column names details + +
Description of variable definitions and steps + +We have a codebook that is a tab delimited file and has 4 columns, and each row represents a question in the survey. The first column lists a/the question from the survey (`SurveyColNames`); the second column lists a corresponding simplified column name for that survey question (`SimplifedColNames`); the third column describes the variable format (`VariableFormat`), e.g, is it a double, or a character; the fourth column gives a lengthier description of the question (`Description`), e.g., who was asked it, what possible answers are, etc. + +This code block reads in that codebook and specifically selects the `SimplifiedColNames` column. It then renames the column names of the raw results from the google sheet (where each question is a column) with these simplified column names. + +
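
Because the `colnames<-` call below pairs the simplified names with the sheet columns purely by position, a quick length check can catch a stale codebook early. A small sketch of such a check (not part of the pipeline; it could be run right after the chunk below):

```{r, eval = FALSE}
# The rename below is positional, so the codebook must have exactly one row per sheet column
stopifnot(nrow(simplifiedColNames) == ncol(resultsRaw))
```
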
+ +```{r, message=FALSE} +simplifiedColNames <- + read_delim(here("data/codebook.txt"), + delim = "\t", + col_select = SimplifiedColNames) +resultsTidy <- + resultsRaw %>% `colnames<-`(unlist(simplifiedColNames)) +``` +
+ +## Keep last response if duplicated according to email (if email provided) + +Choosing to select the last response because the respondent may have spent more time thinking about how they wanted to respond after their initial response. + +
Filtering duplicated responses details + +
Description of variable definitions and steps + +* The `table` function tabulates the number of occurrences, and we tell it to ignore literal NAs. Because providing an email was optional, we expect many NA responses. The `table` function, by ignoring NAs, will return the unique emails and the number of times each email was used. We store the tabulated results in the variable `tabulatedEmails` +* Using the `sum` function, we look to see how many emails/responses are provided more than once. `tabulatedEmails > 1` is returning a vector of TRUEs and FALSEs where TRUE means that there was more than one instance/count of a given email and FALSE means there wasn't. The `sum` function in essence counts the number of TRUEs and if the `sum` is greater than 0, that means there is at least one duplicated email whose count is greater than 1. +* `duplicatedEmails` reports which emails are duplicated by using the tabulated/table of emails. First it identifies which emails were observed more than once, using the `which` function, and uses the indices returned from that to index the `names` of the tabulated emails, grabbing the specific emails. +* We want to know which entries from the overall survey responses to remove for each duplicated email. Ideally, we want to remove the responses all at the same time or go backwards removing one at a time, because we don't want to affect downstream indices. The approach here, keeps track of all the indices of interest and removed them at the same time. + * Therefore, we'll use `lapply` to loop through the duplicated emails (`duplicatedEmails`) and grab the index for survey responses associated with that email address (`which(resultsTidy$Email == duplicatedEmails[x])`). + * However, we want to keep the last survey response for each duplicated email. Therefore, we wrap that `which` function in `head(_,-1 )` function so that it grabs all indices except the last one. + * Finally, we `unlist` the indices so that there's a single vector associated with indices for any duplicated email responses to be removed `IDXs_to_remove`. And since we want to remove them all at the same time, we subset `resultsTidy`, grabbing every row except those in `IDXs_to_remove`, as denoted by the `-`. + +
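
For comparison, a more compact dplyr-only sketch of the same idea (keep every response with no email, and only the last response per provided email). This is an alternative formulation, not the approach used in the chunk below:

```{r, eval = FALSE}
# Alternative sketch: keep all NA-email rows, and only the last row per provided email
resultsTidy %>%
  group_by(Email) %>%
  filter(is.na(Email) | row_number() == n()) %>%
  ungroup()
```
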
+ +```{r} + +tabulatedEmails <- table(resultsTidy$Email, useNA = "no") + +if (sum(tabulatedEmails > 1) > 0) { + duplicatedEmails <- + names(tabulatedEmails)[which(tabulatedEmails > 1)] + IDXs_to_remove <- + unlist(lapply(1:length(duplicatedEmails), function(x) + head( + which(resultsTidy$Email == duplicatedEmails[x]),-1 + ))) + resultsTidy <- resultsTidy[-IDXs_to_remove, ] +} + +nrow(resultsTidy) +``` + +
+ +## Identify type of user + +The first question of the poll asks respondents to describe their current usage of the AnVIL and allows us to categorize respondents as potential or current users of the AnVIL. + +
Question and possible answers + +> How would you describe your current usage of the AnVIL platform? + +Possible answers include: + +* For completed/long-term projects (e.g., occasional updates/maintenance as needed) +* For ongoing projects (e.g., consistent project development and/or work) +* For short-term projects (e.g., short, intense bursts separated by a few months) +* I do not currently use the AnVIL, but have in the past +* I have never heard of the AnVIL +* I have never used the AnVIL, but have heard of it + +The first three possible answers represent current or returning AnVIL users. The last three represent potential AnVIL users. + +
+ +
Identifying user type details + +
Description of variable definitions and steps + +We use `case_when` to evaluate the response in the `CurrentUsageDescription` column and assign a corresponding, simplified label of "Current User" or "Potential User". In other words, we translate the given response to a user label. Nesting the `case_when` inside `mutate` means that the translation is saved in a new column, `UserType`. + +
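
Because the first three answers all map to "Current User" and the remaining three to "Potential User", an `%in%`-based version is a more compact sketch of the same mapping; note that, unlike the `case_when` below, it would also label an NA or unexpected answer as a potential user:

```{r, eval = FALSE}
# Compact alternative sketch (NA/unexpected answers fall through to "Potential User")
currentUseAnswers <- c(
  "For ongoing projects (e.g., consistent project development and/or work)",
  "For completed/long-term projects (e.g., occasional updates/maintenance as needed)",
  "For short-term projects (e.g., short, intense bursts separated by a few months)"
)
resultsTidy %>%
  mutate(UserType = if_else(CurrentUsageDescription %in% currentUseAnswers,
                            "Current User", "Potential User"))
```
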
+ +```{r} +resultsTidy %<>% + mutate( + UserType = case_when( + CurrentUsageDescription == "For ongoing projects (e.g., consistent project development and/or work)" ~ "Current User", + CurrentUsageDescription == "For completed/long-term projects (e.g., occasional updates/maintenance as needed)" ~ "Current User", + CurrentUsageDescription == "For short-term projects (e.g., short, intense bursts separated by a few months)" ~ "Current User", + CurrentUsageDescription == "I do not currently use the AnVIL, but have in the past" ~ "Potential User", + CurrentUsageDescription == "I have never used the AnVIL, but have heard of it" ~ "Potential User", + CurrentUsageDescription == "I have never heard of the AnVIL" ~ "Potential User" + ) + ) %>% + mutate(UserType = factor(UserType, levels = c("Potential User", "Current User"))) +``` + +
+ +## Institutional Affiliation: Synchronize Institution Names + +Users were able to disclose their institutional affiliation using a free text response, therefore we needed to synchronize institution names (example: Johns Hopkins and Johns Hopkins University refer to the same institution, despite the difference in the free responses) and added simplified affiliation categories ([R1 University, R2 University, Community College, Medical Center or School, International Location, Research Center, NIH, Industry, Unknown] and [Research Intensive, Education Focused, and Industry & Other]). The first level of affiliation categories are notated in an institution specific codebook (`data/institution_codebook.txt`) + +
Question and possible answers + +> What institution are you affiliated with? + +Free response for answers + +
+ +
Institutional affiliation synchronization details + +This synchronization corrects for the various spellings and capitalizations used for the same institution (e.g., Johns Hopkins and Johns Hopkins University refer to the same institution, despite the difference in the free responses). + +
Description of variable definitions and steps + +We use a `recode()` within a `mutate()` to synchronize the institutional affiliations as necessary + +
+ + +```{r} +resultsTidy %<>% + mutate( + InstitutionalAffiliation = + recode( + InstitutionalAffiliation, + "Broad" = "Broad Institute", + "broad institute" = "Broad Institute", + "CUNY School of Public Health; Roswell Park Comprehensive Cancer Center" = "City University of New York", + "harvard" = "Harvard University", + "Harvard Public Health" = "Harvard University", + "Johns hopkins" = "Johns Hopkins", + "Johns Hopkins University" = "Johns Hopkins", + "OHSU" = "Oregon Health & Science University", + "OHSU (Knight Center)" = "Oregon Health & Science University", + "The Ohio State University" = "Ohio State University", + "UCSC" = "University of California Santa Cruz", + "univ. ca. santa cruz" = "University of California Santa Cruz", + "university of California santa cruz" = "University of California Santa Cruz", + "UMASS Chan Medical School" = "UMass Chan Medical School", + "Umass Chan Medical School" = "UMass Chan Medical School", + "Washington University in St Louis" = "Washington University in St. Louis", + "yikongene" = "Yikon Genomics", + "v" = "Unknown" + ) + ) +``` + +Elizabeth Humphries grouped institutional affiliations into a limited set of categories: R1 University, R2 University, Community College, Medical Center or School, International Location, Research Center, NIH, Industry, Unknown and we notated those groupings/labels within the `institution_codebook.txt` data file, . Grouping into limited institutional affiliation categories allows us to consolidate free answers for easier data visualization and identification of trends. + +
Description of variable definitions and steps + +We use a `read_delim()` to read in the institution_codebook file, and select just the `InstitutionalAffiliation` and `InstitutionalType` columns (ignoring the column that specifies how institutions were entered by survey respondents). We then use a full_join by the `InstitutionalAffiliation` column to add an `InstitutionalType` column such that the category labels are now included as a new column, joining the appropriate values dependent upon the `InstitutionalAffiliation` column. + +
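
One thing to keep in mind with a `full_join` is that it also keeps codebook institutions that received no responses, and leaves `InstitutionalType` as NA for any affiliation missing from the codebook. A small sketch of a check for affiliations that still need a codebook entry (not part of the pipeline; it could be run after the chunk below):

```{r, eval = FALSE}
# Affiliations present in the survey responses but missing from the institution codebook
anti_join(resultsTidy, institutionCodeBook, by = "InstitutionalAffiliation") %>%
  distinct(InstitutionalAffiliation)
```
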
+ +```{r, message = FALSE} +institutionCodeBook <- read_delim(here("data/institution_codebook.txt"), delim="\t", col_select = c(InstitutionalAffiliation, InstitutionalType)) + +resultsTidy <- full_join(resultsTidy, institutionCodeBook, by = "InstitutionalAffiliation") +``` + +Here we even further simplify Institutional Affiliations to focus on Research Intensive, Education Focused, and Industry & Other + +This groups R1 University, Research Center, Medical Center or School, and NIH as "Research Intensive"; R2 University & Community College as "Education Focused"; and Industry, International Location, or Unknown as "Industry & Other". + +```{r} +resultsTidy %<>% + mutate(FurtherSimplifiedInstitutionalType = + case_when( + InstitutionalType == "R1 University" ~ "Research Intensive", + InstitutionalType == "Research Center" ~ "Research Intensive", + InstitutionalType == "Medical Center or School" ~ "Research Intensive", + InstitutionalType == "NIH" ~ "Research Intensive", + InstitutionalType == "R2 University" ~ "Education Focused", + InstitutionalType == "Community College" ~ "Education Focused", + InstitutionalType == "Industry" ~ "Industry & Other", + InstitutionalType == "International Location" ~ "Industry & Other", + InstitutionalType == "Unknown" ~ "Industry & Other" + ) + ) +``` + +
+ +## Highest degree attained + +This question allowed more than one response, however, only one response selected two (PhD, MD), which we recoded to be MD/PhD. We simplify the possible responses to group attained or in progress degrees + + +
Question and possible answers + +> What is the highest degree you have attained? + +Possible answers include (and multiple choices could be selected and would be comma separated if so) + +* High school or equivalent +* Bachelor's degree +* Master's degree in progress +* Master's degree +* PhD in progress +* PhD +* MD in progress +* MD +* Other (with free text entry) + +
+ +
Degree recoding details + +
Description of variable definitions and steps + +Because multiple responses could be selected (and would be comma separated if so), and because a free text response was possible if Other was selected, we need to tidy the data from this question. From visual inspection of the data, the only time multiple responses were selected was for "PhD, MD", and no Other responses were entered, so we simply recode "PhD, MD" to "MD/PhD". + +We also set the factor levels to follow the general progression of degrees. + +
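
The "visual inspection" above can also be confirmed programmatically. A quick sketch that lists any multi-select (comma-containing) degree responses, which should return only "PhD, MD"; it assumes `Degrees` comes in as a plain character column and would be run before the recode chunk below:

```{r, eval = FALSE}
# Confirm that "PhD, MD" is the only multi-select degree response
resultsTidy %>%
  filter(str_detect(Degrees, ",")) %>%
  count(Degrees)
```
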
+ + +```{r} +resultsTidy %<>% + mutate( + Degrees = + factor(recode(Degrees, "PhD, MD" = "MD/PhD"), levels = c("High School or equivalent", "Bachelor's degree", "Master's degree in progress", "Master's degree", "PhD in progress", "PhD", "MD in progress", "MD", "MD/PhD")), + FurtherSimplifiedDegrees = recode(Degrees, + "Master's degree in progress" = "Master's degree (or in progress)", + "Master's degree" = "Master's degree (or in progress)", + "PhD in progress" = "PhD (or in progress)", + "PhD" = "PhD (or in progress)", + "MD/PhD" = "MD (MD, MD/PhD, or in progress)", + "MD in progress" = "MD (MD, MD/PhD, or in progress)", + "MD" = "MD (MD, MD/PhD, or in progress)" + ) + ) +``` + +
+ +## Tool Knowledge and Comfort Separate from the AnVIL and on the AnVIL + +We want to recode these responses to set the factor level/progression from Don't know it, not at all comfortable, all the way to extremely comfortable and make corresponding integer comfort scores. + +
Question and possible answers + +>How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? +>How would you rate your knowledge of or comfort with these technologies (on the AnVIL)? +>How would you rate your knowledge of or comfort with these AnVIL data features? +Shared technologies between these two questions include + +* Jupyter Notebooks: `CurrentAnVILTechJupyterNotebooks` & `AllTechJupyterNotebooks` +* Bioconductor & RStudio: `CurrentAnVILTechRStudio` & `AllTechRStudio` + `AllTechBioconductor` +* Galaxy: `CurrentAnVILTechGalaxy` & `AllTechGalaxy` +* WDL Workflows / Workflows (e.g., WDL): `CurrentAnVILTechWDL` & `AllTechWorkflows` +* Containers: `CurrentAnVILTechContainers` & `AllTechContainers` +* Unix / Command Line: `CurrentAnVILTechCommandLine` & `AllTechCommandLine` + +Technologies only asked separate from the AnVIL + +* Python: `AllTechPython` +* R: `AllTechR` + +Technologies/data features only asked with regards to the AnVIL + +* Accessing controlled access datasets: `CurrentAnVILTechAccessData` +* DUOS (Data Use Oversight System): `CurrentAnVILTechDUOS` +* Terra on AnVIL (Workspaces): `CurrentAnVILTechTerra` +* TDR (Terra Data Repository): `CurrentAnVILTechTDR` + +Possible answers for each of these questions include + +* Don't know it (0) +* Not at all comfortable (1) +* Slightly comfortable (2) +* Somewhat comfortable (3) +* Moderately comfortable (4) +* Extremely comfortable (5) + +Notated possible "comfort scores" in parentheses next to each possible answer. We'll add these as additional columns that now start with the word "Score_" but otherwise retain the column name, in case it's helpful to still have the words (whose factor level we'll set to reflect the progression of knowledge/comfort). + +Responses are NA if the question wasn't asked to the survey taker (e.g., they were a potential user and weren't asked about technologies with regards to the AnVIL) + +
+ +
Cleaning comfort level/scores for various technologies and resources details + +It's likely that someone who is a program administrator will select "Don't know it" for these technologies. A possible follow-up is to remove those responses and see how the average scores change. + +
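
A sketch of that follow-up, comparing the mean comfort score for one technology with and without the "Don't know it" (score 0) responses. It assumes the `Score_` columns created in the chunk further below, and `AllTechR` is just one example column:

```{r, eval = FALSE}
# Example follow-up: how much do "Don't know it" answers pull the average down?
resultsTidy %>%
  summarize(
    mean_with_dont_know    = mean(Score_AllTechR, na.rm = TRUE),
    mean_without_dont_know = mean(Score_AllTechR[Score_AllTechR > 0], na.rm = TRUE)
  )
```
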
Description of variable definitions and steps + +We select the relevant columns (those that start with "CurrentAnVILTech" or "AllTech") that we want to work with. We don't want them to be lists. The non-tidyverse way of doing this would be something like `unlist(as.character(resultsTidy$CurrentAnVILTechJupyterNotebooks))` for a single column. Instead we use the `unnest` tidyverse function with a `keep_empty = TRUE` argument so that it preserves the NULL values. Notice that the non-tidyverse way also needs `as.character` in order to preserve the NULL values; in the tidyverse way, we still have to apply an `as.character` type change before the `unnest`, otherwise we get an error that double and character values can't be combined. + +After the `unnest` we use the `mutate` function first to work with these as factors (to set the progression we want, from "Don't know it" all the way to "Extremely comfortable") and then to make the replacements specified above, putting an integer score in place of the comfort level. These scores go in new columns whose names begin with "Score_" followed by the corresponding original column name. + +
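
To make the `as.character`-before-`unnest` point concrete, here is a tiny toy illustration (purely hypothetical data, not from the survey) of what happens to a list-column containing a NULL when it is coerced to character: the NULL becomes the literal string "NULL", which is why later sections recode "NULL" to `NA_character_`:

```{r, eval = FALSE}
# Toy list-column like those returned by read_sheet() for partially-answered questions
toy <- tibble(x = list("Moderately comfortable", NULL, "Don't know it"))
toy %>% mutate(x = as.character(x))
# x becomes: "Moderately comfortable", "NULL", "Don't know it"
```
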
+ +```{r} +resultsTidy %<>% + mutate(across(starts_with(c( + "CurrentAnVILTech", "AllTech" + )), as.character)) %>% + unnest(starts_with(c("CurrentAnVILTech", "AllTech")), keep_empty = TRUE) %>% + mutate(across(starts_with(c( + "CurrentAnVILTech", "AllTech" + )), ~ parse_factor( + ., + levels = c( + "Don't know it", + "Not at all comfortable", + "Slightly comfortable", + "Somewhat comfortable", + "Moderately comfortable", + "Extremely comfortable" + ) + ))) %>% + mutate(across( + starts_with(c("CurrentAnVILTech", "ALLTech")), + ~ case_when( + . == "Don't know it" ~ 0, + . == "Not at all comfortable" ~ 1, + . == "Slightly comfortable" ~ 2, + . == "Somewhat comfortable" ~ 3, + . == "Moderately comfortable" ~ 4, + . == "Extremely comfortable" ~ 5 + ) + , + .names = "Score_{.col}" + )) +``` + +
+ +## Feature importance: Comparisons of rank of importance of features/resources between Current Users and Potential Users + +We want to recode these responses to remove labels and make them integers. + +
Question and possible answers + +>Rank the following features or resources according to their importance for your continued use of the AnVIL + +>Rank the following features or resources according to their importance to you as a potential user of the AnVIL? + +* Easy billing setup +* Flat-rate billing rather than use-based +* Free version with limited compute or storage +* On demand support and documentation +* Specific tools or datasets are available/supported +* Greater adoption of the AnVIL by the scientific community + +We're going to look at a comparison of the assigned ranks for these features, comparing between current users and potential users. + +
+ +
Cleaning/recoding the feature importance ranks details + +
Description of variable definitions and steps + +We can use `starts_with` to select these columns, specifically those that start with "PotentialRank" or "CurrentRank". When we made simplified names for the columns, these are the only twelve that start like that. + +Each survey taker was asked either the 6 CurrentRank questions or the 6 PotentialRank questions, which means we expect NULL values in these columns since no survey taker answered all of them. + +We want to recode the following values: + +* Replace 1 (Most important in this list) with 1 +* Replace 6 (Least important in this list) with 6 + +Before we can do that, we first need to change the type of the columns in several ways. We don't want them to be lists. The non-tidyverse way of doing this would be `unlist(as.character(resultsTidy$PotentialRankEasyBillingSetup))`. We can use the `unnest` tidyverse function with a `keep_empty = TRUE` argument so that it preserves the NULL values. Notice that the non-tidyverse way needs `as.character` in order to preserve the NULL values; in the tidyverse way, we still have to apply an `as.character` type change before the `unnest`, otherwise we get an error that double and character values can't be combined. + +After the `unnest` we can use the `recode` function to make the replacements specified above. Then we change the type from character to integer so that we can compute the average rank and plot it more easily. There would be a warning that NAs are introduced by coercion when we change the type to integer, so we add a replacement in the `recode`, changing "NULL" to `NA_character_`. + +
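
Once the ranks are integers (i.e., after the chunk below), the average rank mentioned above is a short `summarize` away. A sketch for the current-user columns; the potential-user columns work the same way:

```{r, eval = FALSE}
# Mean assigned rank per feature among current users (lower = more important)
resultsTidy %>%
  summarize(across(starts_with("CurrentRank"), ~ mean(.x, na.rm = TRUE))) %>%
  pivot_longer(everything(), names_to = "Feature", values_to = "MeanRank") %>%
  arrange(MeanRank)
```
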
+ +```{r} +resultsTidy %<>% + mutate(across(starts_with(c( + "PotentialRank", "CurrentRank" + )), as.character)) %>% + unnest(starts_with(c("PotentialRank", "CurrentRank")), keep_empty = TRUE) %>% + mutate(across( + starts_with(c("PotentialRank", "CurrentRank")), + ~ recode( + .x, + "1 (Most important in this list)" = "1", + "6 (Least important in this list)" = "6", + "NULL" = NA_character_ + ) + )) %>% + mutate(across(starts_with(c( + "PotentialRank", "CurrentRank" + )), as.integer)) +``` + +
+ +## Training Modality Preference + +We want to recode these responses to remove labels and make them integers. + +
Question and possible answers + +>Please rank how/where you would prefer to attend AnVIL training workshops. + +Possible answers include + +* On-site at my institution: `AnVILTrainingWorkshopsOnSite` +* Virtual: `AnVILTrainingWorkshopsVirtual` +* Conference (e.g., CSHL, AMIA): `AnVILTrainingWorkshopsConference` +* AnVIL-specific event: `AnVILTrainingWorkshopsSpecEvent` +* Other: `AnVILTrainingWorkshopsOther` + +The responses are stored in the starts with `AnVILTrainingWorkshops` columns + +
+ +
Cleaning the training modality ranks details + +
Description of variable definitions and steps + +We can use `starts_with` to select these columns, specifically those that start with "AnVILTrainingWorkshops". These are the only 5 that start like that when we made simplified column names. + +We want to recode the following values: + +* Replace 1 (Most preferred in this list) with 1 +* Replace 5 (Least preferred in this list) with 5 + +Before we can do that, we first need to change the type of the columns in several ways. We don't want them to be lists. We can use the `unnest` tidyverse function with a `keep_empty = TRUE` argument so that it preserves any NULL values, but first we have to apply an `as.character` type change before the `unnest`, otherwise we get an error that double and character values can't be combined. + +After the `unnest` we can use the `recode` function to make the replacements specified above. Then we change the type from character to integer so that we can compute the average rank and plot it more easily. There would be a warning that NAs are introduced by coercion when we change the type to integer, so we add a replacement in the `recode`, changing "NULL" to `NA_character_`. + +
+ +```{r} + +resultsTidy %<>% + mutate(across(starts_with( + "AnVILTrainingWorkshops"), as.character)) %>% + unnest(starts_with("AnVILTrainingWorkshops"), keep_empty = TRUE) %>% + mutate(across( + starts_with("AnVILTrainingWorkshops"), + ~ recode( + .x, + "1 (Most preferred in this list)" = "1", + "5 (Least preferred in this list)" = "5", + "NULL" = NA_character_ + ) + )) %>% + mutate(across(starts_with("AnVILTrainingWorkshop"), as.integer)) + +``` + +
+ +## Simplified experience status for various research categories (clinical, human genomics, non-human genomics) + +Want to add three columns that act as flags reporting if the respondent is + +* experienced with clinical research, specifically either moderately or extremely experienced in working with human clinical data +* experienced with human genomics research, specifically is moderately or extremely experienced in working with human genomics data +* experienced with non-human genomics research expert, specifically is moderately or extremely experienced in working with non-human genomics data + +We will use this information later to subset responses when considering popular tools or datasets. + +
Question and possible answers + +>How much experience do you have analyzing the following data categories? + +The three research categories people are asked about include + +* Human Genomic +* Non-human Genomic +* Human Clinical + +Possible answers include + +* Not at all experienced +* Slightly experienced +* Somewhat experienced +* Moderately experienced +* Extremely experienced. + +
+ +
Setting research category experience flag details + +
Description of variable definitions and steps + +We use a `mutate` together with 3 `case_when`'s. + +* If the `HumanClinicalExperience` column response is "Moderately experienced" or "Extremely experienced", we mark that respondent as a human clinical research expert in the `clinicalFlag` column (`TRUE`). Otherwise, we mark a `FALSE` to signify they are not a clinical research expert. +* If the `HumanGenomicExperience` column response is "Moderately experienced" or "Extremely experienced", we mark that respondent as a human genomic research expert in the `humanGenomicFlag` column (`TRUE`). Otherwise, we again mark a `FALSE` to signify not an expert. +* If the `NonHumanGenomicExperience` column response is "Moderately experienced" or "Extremely experienced", we mark that respondent as a non-human genomic research expert in the `nonHumanGenomicFlag` column (`TRUE`). Otherwise, we again mark a `FALSE` to signify not an expert. + +
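
Since each flag is just "the response is one of the top two experience levels", an `%in%`-based formulation is an equivalent, slightly more compact sketch of the same logic as the chunk below:

```{r, eval = FALSE}
# Equivalent sketch of the three experience flags using %in% (NA responses become FALSE)
resultsTidy %>%
  mutate(
    clinicalFlag        = HumanClinicalExperience   %in% c("Moderately experienced", "Extremely experienced"),
    humanGenomicFlag    = HumanGenomicExperience    %in% c("Moderately experienced", "Extremely experienced"),
    nonHumanGenomicFlag = NonHumanGenomicExperience %in% c("Moderately experienced", "Extremely experienced")
  )
```
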
+ +```{r} +resultsTidy %<>% + mutate( + clinicalFlag = case_when( + HumanClinicalExperience == "Moderately experienced" | HumanClinicalExperience == "Extremely experienced" ~ TRUE, + .default = FALSE + ), + humanGenomicFlag = case_when( + HumanGenomicExperience == "Moderately experienced" | HumanGenomicExperience == "Extremely experienced" ~ TRUE, + .default = FALSE + ), + nonHumanGenomicFlag = case_when(NonHumanGenomicExperience == "Moderately experienced" | NonHumanGenomicExperience == "Extremely experienced" ~ TRUE, + .default = FALSE) + ) +``` + +
+ +## AnVIL Demo Attendance, Awareness, and Utilization + +The question asked was pretty granular in describing attendance, use, and awareness of AnVIL Demos. We we want to simplify each possible answer to a binary version of aware of/not aware of or used/have not used. + +
Question and possible answers + +> Have you attended a monthly AnVIL Demo? + +Possible answers include + +* Yes, multiple +* Yes, one +* Not yet, but am registered to +* No, but aware of +* No, didn't know of + +
+ +
AnVIL Demo recoding details + +
Description of variable definitions and steps + +We keep the original `AnVILDemo` response as a factor ordered from most to least engagement, and then use two `case_when` calls inside a `mutate` to collapse it into binary factors: `AnVILDemoAwareness` (aware of the demos or not) and `AnVILDemoUse` (have attended, or are registered for, a demo vs. have not). + +
+ +```{r, message = FALSE} +resultsTidy %<>% + mutate(AnVILDemo = factor(AnVILDemo, levels = c("Yes, multiple", "Yes, one", "Not yet, but am registered to", "No, but aware of", "No, didn't know of")), + AnVILDemoAwareness = factor(case_when( + AnVILDemo == "Yes, multiple" ~ "Aware of", + AnVILDemo == "Yes, one" ~ "Aware of", + AnVILDemo == "Not yet, but am registered to" ~ "Aware of", + AnVILDemo == "No, but aware of" ~ "Aware of", + AnVILDemo == "No, didn't know of" ~ "Not Aware of" + ), levels = c("Not Aware of", "Aware of")), + AnVILDemoUse = factor(case_when( + AnVILDemo == "Yes, multiple" ~ "Have/will utilize", + AnVILDemo == "Yes, one" ~ "Have/will utilize", + AnVILDemo == "Not yet, but am registered to" ~ "Have/will utilize", + AnVILDemo == "No, but aware of" ~ "Have not utilized", + AnVILDemo == "No, didn't know of" ~ "Have not utilized" + ), levels = c("Have not utilized", "Have/will utilize")) +) + +``` + +
diff --git a/_site.yml b/_site.yml deleted file mode 100644 index 7f07604..0000000 --- a/_site.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: OTTR Template Website -output_dir: 'docs' -navbar: - title: OTTR Web - left: - - text: "" - href: index.html - icon: fa-home - - text: 1. Setup - href: setup.html - - text: 2. Hosting - href: hosting.html - - text: Editing - href: editing.html - - text: 3. Style - href: style.html - - text: 4. Git Actions - href: git_actions.html - - text: More ottr docs - href: https://www.ottrproject.org/ - - -output: - html_document: - theme: cosmo - lib_dir: site_libs - self_contained: no - highlight: textmate - css: styles.css - includes: - in_header: resources/header.html diff --git a/OTTR_Template_Website.Rproj b/anvilPoll2024.Rproj similarity index 75% rename from OTTR_Template_Website.Rproj rename to anvilPoll2024.Rproj index 628359e..8e3c2eb 100644 --- a/OTTR_Template_Website.Rproj +++ b/anvilPoll2024.Rproj @@ -9,9 +9,5 @@ UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 -RnwWeave: knitr +RnwWeave: Sweave LaTeX: pdfLaTeX - -AutoAppendNewline: Yes - -BuildType: Website diff --git a/anvilPoll2024MainAnalysis.Rmd b/anvilPoll2024MainAnalysis.Rmd new file mode 100644 index 0000000..c44a507 --- /dev/null +++ b/anvilPoll2024MainAnalysis.Rmd @@ -0,0 +1,1296 @@ +--- +title: "State of the AnVIL 2024" +subtitle: "Main analysis" +author: "Kate Isaac, Elizabeth Humphries, & Ava Hoffman" +date: "`r Sys.Date()`" +output: html_document +--- + +```{r, message=FALSE} +library(googlesheets4) +library(tidyverse) +library(magrittr) #for %<>% +library(here) +library(grid) #for Grobs and unit() +``` + +# Read in data + +Data were read in via a Google Sheet on the AnVIL Team Drive. + +
Import details +The google sheet we are reading in is stored in an AnVIL Google drive folder `State of the AnVIL 2024`. Its permissions are restricted such that only people with access can open with the link. Using `gs4_auth()` to authorize my google account before running this code, I needed to change the `scopes` argument, specifically to `scopes = "spreadsheets.readonly"`. + +In this google sheet, each question is a column, and each response to the survey is a row. If the respondent wasn't asked or didn't answer a specific question, there is an NA in the corresponding row/column. + +```{r, echo=FALSE, message=FALSE} +gs4_auth(email = "kathryn.j.isaac@gmail.com", scopes="spreadsheets.readonly") +resultsRaw <- + googlesheets4::read_sheet( + "https://docs.google.com/spreadsheets/d/1wDMNC6BD2AaIwh_GOkPTpl1tvAyLwVBQgAvOD2rYrX0/edit?usp=sharing", + na = c("NA", "na", "")) +``` + +
+ +# Clean data + +**Note:** Every code block in this section edits the `resultsTidy` data frame and should be run before plotting within the `# Insights` section below. Subsections are marked according to which Insight they are related to, but cleaning steps like identifying the user type are important for most every plot. + +## Set Column Names + +We set the column names to simplified column names (e.g., that help us select related columns for various analyses) by reading in a codebook (`data/codebook.txt`). + +
Simplifying column names details + +
Description of variable definitions and steps + +We have a codebook that is a tab delimited file and has 4 columns, and each row represents a question in the survey. The first column lists a/the question from the survey (`SurveyColNames`); the second column lists a corresponding simplified column name for that survey question (`SimplifedColNames`); the third column describes the variable format (`VariableFormat`), e.g, is it a double, or a character; the fourth column gives a lengthier description of the question (`Description`), e.g., who was asked it, what possible answers are, etc. + +This code block reads in that codebook and specifically selects the `SimplifiedColNames` column. It then renames the column names of the raw results from the google sheet (where each question is a column) with these simplified column names. + +
+ +```{r, message=FALSE} +simplifiedColNames <- + read_delim(here("data/codebook.txt"), + delim = "\t", + col_select = SimplifiedColNames) +resultsTidy <- + resultsRaw %>% `colnames<-`(unlist(simplifiedColNames)) +``` +
+ +## Keep last response if duplicated according to email (if email provided) + +Choosing to select the last response because the respondent may have spent more time thinking about how they wanted to respond after their initial response. + +
Filtering duplicated responses details + +
Description of variable definitions and steps + +* The `table` function tabulates the number of occurrences, and we tell it to ignore literal NAs. Because providing an email was optional, we expect many NA responses. The `table` function, by ignoring NAs, will return the unique emails and the number of times each email was used. We store the tabulated results in the variable `tabulatedEmails` +* Using the `sum` function, we look to see how many emails/responses are provided more than once. `tabulatedEmails > 1` is returning a vector of TRUEs and FALSEs where TRUE means that there was more than one instance/count of a given email and FALSE means there wasn't. The `sum` function in essence counts the number of TRUEs and if the `sum` is greater than 0, that means there is at least one duplicated email whose count is greater than 1. +* `duplicatedEmails` reports which emails are duplicated by using the tabulated/table of emails. First it identifies which emails were observed more than once, using the `which` function, and uses the indices returned from that to index the `names` of the tabulated emails, grabbing the specific emails. +* We want to know which entries from the overall survey responses to remove for each duplicated email. Ideally, we want to remove the responses all at the same time or go backwards removing one at a time, because we don't want to affect downstream indices. The approach here, keeps track of all the indices of interest and removed them at the same time. + * Therefore, we'll use `lapply` to loop through the duplicated emails (`duplicatedEmails`) and grab the index for survey responses associated with that email address (`which(resultsTidy$Email == duplicatedEmails[x])`). + * However, we want to keep the last survey response for each duplicated email. Therefore, we wrap that `which` function in `head(_,-1 )` function so that it grabs all indices except the last one. + * Finally, we `unlist` the indices so that there's a single vector associated with indices for any duplicated email responses to be removed `IDXs_to_remove`. And since we want to remove them all at the same time, we subset `resultsTidy`, grabbing every row except those in `IDXs_to_remove`, as denoted by the `-`. + +
+ +```{r} + +tabulatedEmails <- table(resultsTidy$Email, useNA = "no") + +if (sum(tabulatedEmails > 1) > 0) { + duplicatedEmails <- + names(tabulatedEmails)[which(tabulatedEmails > 1)] + IDXs_to_remove <- + unlist(lapply(1:length(duplicatedEmails), function(x) + head( + which(resultsTidy$Email == duplicatedEmails[x]),-1 + ))) + resultsTidy <- resultsTidy[-IDXs_to_remove, ] +} + +nrow(resultsTidy) +``` + +
+ +## Identify type of user + +The first question of the poll asks respondents to describe their current usage of the AnVIL and allows us to categorize respondents as potential or current users of the AnVIL. + +
Question and possible answers + +> How would you describe your current usage of the AnVIL platform? + +Possible answers include: + +* For completed/long-term projects (e.g., occasional updates/maintenance as needed) +* For ongoing projects (e.g., consistent project development and/or work) +* For short-term projects (e.g., short, intense bursts separated by a few months) +* I do not currently use the AnVIL, but have in the past +* I have never heard of the AnVIL +* I have never used the AnVIL, but have heard of it + +The first three possible answers represent current or returning AnVIL users. The last three represent potential AnVIL users. + +
+ +
Identifying user type details + +
Description of variable definitions and steps + +We use `case_when` to evaluate the response in the `CurrentUsageDescription` column and assign a corresponding, simplified label of "CurrentUser" or "PotentialUser". In other words, we translate the given response to a user label. Nesting the `case_when` inside `mutate` means that the translation is saved in a new column, `UserType`. + +
+ +```{r} +resultsTidy %<>% + mutate( + UserType = case_when( + CurrentUsageDescription == "For ongoing projects (e.g., consistent project development and/or work)" ~ "CurrentUser", + CurrentUsageDescription == "For completed/long-term projects (e.g., occasional updates/maintenance as needed)" ~ "CurrentUser", + CurrentUsageDescription == "For short-term projects (e.g., short, intense bursts separated by a few months)" ~ "CurrentUser", + CurrentUsageDescription == "I do not currently use the AnVIL, but have in the past" ~ "PotentialUser", + CurrentUsageDescription == "I have never used the AnVIL, but have heard of it" ~ "PotentialUser", + CurrentUsageDescription == "I have never heard of the AnVIL" ~ "PotentialUser" + ) + ) %>% + mutate(UserType = factor(UserType, levels = c("PotentialUser", "CurrentUser"))) +``` + +
+ +## Institutional Affiliation: Synchronize Institution Names + +Users were able to disclose their institutional affiliation using a free text response, therefore we needed to synchronize institution names (example: Johns Hopkins and Johns Hopkins University refer to the same institution, despite the difference in the free responses) and added simplified affiliation categories ([R1 University, R2 University, Community College, Medical Center or School, International Location, Research Center, NIH, Industry, Unknown] and [Research Intensive, Education Focused, and Industry & Other]). The first level of affiliation categories are notated in an institution specific codebook (`data/institution_codebook.txt`) + +
Question and possible answers + +> What institution are you affiliated with? + +Free response for answers + +
+ +
Institutional affiliation synchronization details + +This synchronization corrects for the various spellings and capitalizations used for the same institution (e.g., Johns Hopkins and Johns Hopkins University refer to the same institution, despite the difference in the free responses). + +
Description of variable definitions and steps + +We use a `recode()` within a `mutate()` to synchronize the institutional affiliations as necessary + +
+ + +```{r} +resultsTidy %<>% + mutate( + InstitutionalAffiliation = + recode( + InstitutionalAffiliation, + "Broad" = "Broad Institute", + "broad institute" = "Broad Institute", + "CUNY School of Public Health; Roswell Park Comprehensive Cancer Center" = "City University of New York", + "harvard" = "Harvard University", + "Harvard Public Health" = "Harvard University", + "Johns hopkins" = "Johns Hopkins", + "Johns Hopkins University" = "Johns Hopkins", + "OHSU" = "Oregon Health & Science University", + "OHSU (Knight Center)" = "Oregon Health & Science University", + "The Ohio State University" = "Ohio State University", + "UCSC" = "University of California Santa Cruz", + "univ. ca. santa cruz" = "University of California Santa Cruz", + "university of California santa cruz" = "University of California Santa Cruz", + "UMASS Chan Medical School" = "UMass Chan Medical School", + "Umass Chan Medical School" = "UMass Chan Medical School", + "Washington University in St Louis" = "Washington University in St. Louis", + "yikongene" = "Yikon Genomics", + "v" = "Unknown" + ) + ) +``` + +Elizabeth Humphries grouped institutional affiliations into a limited set of categories: R1 University, R2 University, Community College, Medical Center or School, International Location, Research Center, NIH, Industry, Unknown and we notated those groupings/labels within the `institution_codebook.txt` data file, . Grouping into limited institutional affiliation categories allows us to consolidate free answers for easier data visualization and identification of trends. + +
Description of variable definitions and steps + +We use a `read_delim()` to read in the institution_codebook file, and select just the `InstitutionalAffiliation` and `InstitutionalType` columns (ignoring the column that specifies how institutions were entered by survey respondents). We then use a full_join by the `InstitutionalAffiliation` column to add an `InstitutionalType` column such that the category labels are now included as a new column, joining the appropriate values dependent upon the `InstitutionalAffiliation` column. + +
+ +```{r, message = FALSE} +institutionCodeBook <- read_delim(here("data/institution_codebook.txt"), delim="\t", col_select = c(InstitutionalAffiliation, InstitutionalType)) + +resultsTidy <- full_join(resultsTidy, institutionCodeBook, by = "InstitutionalAffiliation") +``` + +Here we even further simplify Institutional Affiliations to focus on Research Intensive, Education Focused, and Industry & Other + +This groups R1 University, Research Center, Medical Center or School, and NIH as "Research Intensive"; R2 University & Community College as "Education Focused"; and Industry, International Location, or Unknown as "Industry & Other". + +```{r} +resultsTidy %<>% + mutate(FurtherSimplifiedInstitutionalType = + case_when( + InstitutionalType == "R1 University" ~ "Research Intensive", + InstitutionalType == "Research Center" ~ "Research Intensive", + InstitutionalType == "Medical Center or School" ~ "Research Intensive", + InstitutionalType == "NIH" ~ "Research Intensive", + InstitutionalType == "R2 University" ~ "Education Focused", + InstitutionalType == "Community College" ~ "Education Focused", + InstitutionalType == "Industry" ~ "Industry & Other", + InstitutionalType == "International Location" ~ "Industry & Other", + InstitutionalType == "Unknown" ~ "Industry & Other" + ) + ) +``` + +
+ +## Highest degree attained + +This question allowed more than one response, however, only one response selected two (PhD, MD), which we recoded to be MD/PhD. We simplify the possible responses to group attained or in progress degrees + + +
Question and possible answers + +> What is the highest degree you have attained? + +Possible answers include (and multiple choices could be selected and would be comma separated if so) + +* High school or equivalent +* Bachelor's degree +* Master's degree in progress +* Master's degree +* PhD in progress +* PhD +* MD in progress +* MD +* Other (with free text entry) + +
+ +
Degree recoding details + +
Description of variable definitions and steps
+
+Because multiple responses could be selected (and would be comma separated if so), and because a free text response was possible if "Other" was selected, we need to tidy the data from this question. From visual inspection of the data, we see that the only multiple-response answer was "PhD, MD", and no "Other" responses were selected. So we'll just recode "PhD, MD" to be "MD/PhD".
+
+Let's also set the factor levels to follow the general progression of degrees.
+
+ + +```{r} +resultsTidy %<>% + mutate( + Degrees = + factor(recode(Degrees, "PhD, MD" = "MD/PhD"), levels = c("High School or equivalent", "Bachelor's degree", "Master's degree in progress", "Master's degree", "PhD in progress", "PhD", "MD in progress", "MD", "MD/PhD")), + FurtherSimplifiedDegrees = recode(Degrees, + "Master's degree in progress" = "Master's degree (or in progress)", + "Master's degree" = "Master's degree (or in progress)", + "PhD in progress" = "PhD (or in progress)", + "PhD" = "PhD (or in progress)", + "MD/PhD" = "MD (MD, MD/PhD, or in progress)", + "MD in progress" = "MD (MD, MD/PhD, or in progress)", + "MD" = "MD (MD, MD/PhD, or in progress)" + ) + ) +``` + +
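+A small, optional check (a hedged sketch, not part of the original analysis): after the recoding above, any degree response that didn't exactly match one of the factor levels would have become `NA`, so tabulating the recoded columns confirms nothing was silently dropped.
+
+```{r}
+# One row per observed combination; an NA row here would flag an unmatched response
+resultsTidy %>%
+  count(Degrees, FurtherSimplifiedDegrees)
+```
+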
+
+## Tool Knowledge and Comfort Separate from the AnVIL and on the AnVIL
+
+We want to recode these responses to set the factor levels in order of progression, from "Don't know it" and "Not at all comfortable" all the way to "Extremely comfortable", and to make corresponding integer comfort scores.
+
Question and possible answers
+
+>How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)?
+>How would you rate your knowledge of or comfort with these technologies (on the AnVIL)?
+>How would you rate your knowledge of or comfort with these AnVIL data features?
+
+Shared technologies between the first two questions include
+
+* Jupyter Notebooks: `CurrentAnVILTechJupyterNotebooks` & `AllTechJupyterNotebooks`
+* Bioconductor & RStudio: `CurrentAnVILTechRStudio` & `AllTechRStudio` + `AllTechBioconductor`
+* Galaxy: `CurrentAnVILTechGalaxy` & `AllTechGalaxy`
+* WDL Workflows / Workflows (e.g., WDL): `CurrentAnVILTechWDL` & `AllTechWorkflows`
+* Containers: `CurrentAnVILTechContainers` & `AllTechContainers`
+* Unix / Command Line: `CurrentAnVILTechCommandLine` & `AllTechCommandLine`
+
+Technologies only asked about separate from the AnVIL
+
+* Python: `AllTechPython`
+* R: `AllTechR`
+
+Technologies/data features only asked about with regards to the AnVIL
+
+* Accessing controlled access datasets: `CurrentAnVILTechAccessData`
+* DUOS (Data Use Oversight System): `CurrentAnVILTechDUOS`
+* Terra on AnVIL (Workspaces): `CurrentAnVILTechTerra`
+* TDR (Terra Data Repository): `CurrentAnVILTechTDR`
+
+Possible answers for each of these questions include
+
+* Don't know it (0)
+* Not at all comfortable (1)
+* Slightly comfortable (2)
+* Somewhat comfortable (3)
+* Moderately comfortable (4)
+* Extremely comfortable (5)
+
+The possible "comfort scores" are notated in parentheses next to each possible answer. We'll add these scores as additional columns whose names start with "Score_" but otherwise retain the original column name, in case it's helpful to still have the worded responses (whose factor levels we'll set to reflect the progression of knowledge/comfort).
+
+Responses are NA if the question wasn't asked of the survey taker (e.g., they were a potential user and weren't asked about technologies with regards to the AnVIL).
+
+ +
Cleaning Comfort level/scores for various technologies and resources details
+
+It's likely that someone who is a program administrator will select "Don't know it" for these. Should we remove those responses and see how the average scores change?
+
Description of variable definitions and steps
+
+We select the relevant columns we want to work with (those that start with "CurrentAnVILTech" or "AllTech"). We don't want them to be lists. The non-tidyverse way of un-listing such a column would be, e.g., `unlist(as.character(resultsTidy$AllTechPython))`. We can use the `unnest` tidyverse function with a `keep_empty = TRUE` argument so that it preserves the NULL values. Notice that in the non-tidyverse way, we had to use `as.character` in order to preserve the NULL values. In the tidyverse way, we still have to apply an `as.character` type change before the `unnest`; otherwise, we get an error that double and character values can't be combined.
+
+After the `unnest` we can use the `mutate` function to first work with these as factors (to set the progression we want, from "Don't know it" all the way to "Extremely comfortable") and then to make the replacements specified above, creating an integer score for each comfort level. These scores go in new columns whose names begin with "Score_" followed by the corresponding original column name.
+
+
+```{r}
+resultsTidy %<>%
+  # Convert the list-columns to character so unnest() keeps NULLs (as NA)
+  mutate(across(starts_with(c(
+    "CurrentAnVILTech", "AllTech"
+  )), as.character)) %>%
+  unnest(starts_with(c("CurrentAnVILTech", "AllTech")), keep_empty = TRUE) %>%
+  # Set the factor levels to follow the comfort progression
+  mutate(across(starts_with(c(
+    "CurrentAnVILTech", "AllTech"
+  )), ~ parse_factor(
+    .,
+    levels = c(
+      "Don't know it",
+      "Not at all comfortable",
+      "Slightly comfortable",
+      "Somewhat comfortable",
+      "Moderately comfortable",
+      "Extremely comfortable"
+    )
+  ))) %>%
+  # Add comfort scores (0-5) in new columns prefixed with "Score_"
+  mutate(across(
+    starts_with(c("CurrentAnVILTech", "AllTech")),
+    ~ case_when(
+      . == "Don't know it" ~ 0,
+      . == "Not at all comfortable" ~ 1,
+      . == "Slightly comfortable" ~ 2,
+      . == "Somewhat comfortable" ~ 3,
+      . == "Moderately comfortable" ~ 4,
+      . == "Extremely comfortable" ~ 5
+    ),
+    .names = "Score_{.col}"
+  ))
+```
+
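+As a possible follow-up to the note above about program administrators, here is a hedged sketch of a sensitivity check: compare each tool's average comfort score with and without the "Don't know it" (score 0) responses. This is illustrative only and not part of the cleaning itself; `map_dbl()` comes from purrr, which is loaded with the tidyverse.
+
+```{r}
+scoreCols <- resultsTidy %>% select(starts_with("Score_"))
+
+# Average score per column, with and without the "Don't know it" (0) responses
+tibble(
+  ScoreColumn = names(scoreCols),
+  avgAllResponses = colMeans(scoreCols, na.rm = TRUE),
+  avgExcludingDontKnow = map_dbl(scoreCols, ~ mean(.x[.x > 0], na.rm = TRUE))
+)
+```
+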
+ +## Feature importance: Comparisons of rank of importance of features/resources between Current Users and Potential Users + +We want to recode these responses to remove labels and make them integers. + +
Question and possible answers
+
+>Rank the following features or resources according to their importance for your continued use of the AnVIL
+
+>Rank the following features or resources according to their importance to you as a potential user of the AnVIL?
+
+* Easy billing setup
+* Flat-rate billing rather than use-based
+* Free version with limited compute or storage
+* On demand support and documentation
+* Specific tools or datasets are available/supported
+* Greater adoption of the AnVIL by the scientific community
+
+We're going to compare the ranks assigned to these features by current users versus potential users.
+
+### Recode rank values
+
Description of variable definitions and steps + +Columns of interest include + +* PotentialRankEasyBillingSetup +* PotentialRankFlatRateBilling +* PotentialRankFreeVersion +* PotentialRankSupportDocs +* PotentialRankToolsData +* PotentialRankCommunityAdoption +* CurrentRankEasyBillingSetup +* CurrentRankFlatRateBilling +* CurrentRankFreeVersion +* CurrentRankSupportDocs +* CurrentRankToolsData +* CurrentRankCommunityAdoption + +
+ +
Cleaning the feature importance ranks details + +
Description of variable definitions and steps
+
+We can use `starts_with` to select these columns, specifically those that start with "PotentialRank" or "CurrentRank". When we made simplified names for the columns, these are the only twelve that start like that.
+
+Either the 6 CurrentRank questions or the 6 PotentialRank questions were asked of each survey taker, which means we expect NULL values in these columns since not every survey taker will have answered all of these questions.
+
+We want to recode the following values
+
+* Replace "1 (Most important in this list)" with 1
+* Replace "6 (Least important in this list)" with 6
+
+Before we can do that, we first need to change the type of the columns in several ways. We don't want them to be lists. The non-tidyverse way of un-listing a column would be, e.g., `unlist(as.character(resultsTidy$PotentialRankEasyBillingSetup))`. We can use the `unnest` tidyverse function with a `keep_empty = TRUE` argument so that it preserves the NULL values. Notice that in the non-tidyverse way, we had to use `as.character` in order to preserve the NULL values. In the tidyverse way, we still have to apply an `as.character` type change before the `unnest`; otherwise, we get an error that double and character values can't be combined.
+
+After the `unnest` we can use the `recode` function to make the replacements specified above. We then change the type from character to integer so that we can compute average ranks and plot them more easily. Coercing the literal "NULL" strings to integer would produce a warning that NAs are introduced by coercion, so we also add a replacement in the `recode`, changing "NULL" to `NA_character_`.
+
+ +```{r} +resultsTidy %<>% + mutate(across(starts_with(c( + "PotentialRank", "CurrentRank" + )), as.character)) %>% + unnest(starts_with(c("PotentialRank", "CurrentRank")), keep_empty = TRUE) %>% + mutate(across( + starts_with(c("PotentialRank", "CurrentRank")), + ~ recode( + .x, + "1 (Most important in this list)" = "1", + "6 (Least important in this list)" = "6", + "NULL" = NA_character_ + ) + )) %>% + mutate(across(starts_with(c( + "PotentialRank", "CurrentRank" + )), as.integer)) +``` + +
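+A hedged, optional sanity check (not in the original analysis): each respondent should have answered either the six PotentialRank questions or the six CurrentRank questions, so counting non-missing ranks per respondent is a quick way to confirm the recoding and NULL handling behaved as expected.
+
+```{r}
+resultsTidy %>%
+  mutate(
+    nPotentialRanks = rowSums(!is.na(across(starts_with("PotentialRank")))),
+    nCurrentRanks   = rowSums(!is.na(across(starts_with("CurrentRank"))))
+  ) %>%
+  count(UserType, nPotentialRanks, nCurrentRanks)
+```
+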
+ +## Training Modality Preference + +We want to recode these responses to remove labels and make them integers. + +
Question and possible answers
+
+>Please rank how/where you would prefer to attend AnVIL training workshops.
+
+Possible answers include
+
+* On-site at my institution: `AnVILTrainingWorkshopsOnSite`
+* Virtual: `AnVILTrainingWorkshopsVirtual`
+* Conference (e.g., CSHL, AMIA): `AnVILTrainingWorkshopsConference`
+* AnVIL-specific event: `AnVILTrainingWorkshopsSpecEvent`
+* Other: `AnVILTrainingWorkshopsOther`
+
+The responses are stored in the columns that start with `AnVILTrainingWorkshops`.
+
+ +
Cleaning the training modality ranks details + +
Description of variable definitions and steps
+
+We can use `starts_with` to select these columns, specifically those that start with "AnVILTrainingWorkshops". When we made the simplified column names, these are the only 5 that start like that.
+
+We want to recode the following values
+
+* Replace "1 (Most preferred in this list)" with 1
+* Replace "5 (Least preferred in this list)" with 5
+
+Before we can do that, we first need to change the type of the columns in several ways. We don't want them to be lists. We can use the `unnest` tidyverse function with a `keep_empty = TRUE` argument so that it preserves any NULL values, but we first have to apply an `as.character` type change before the `unnest`; otherwise, we get an error that double and character values can't be combined.
+
+After the `unnest` we can use the `recode` function to make the replacements specified above. We then change the type from character to integer so that we can compute average ranks and plot them more easily. Coercing the literal "NULL" strings to integer would produce a warning that NAs are introduced by coercion, so we also add a replacement in the `recode`, changing "NULL" to `NA_character_`.
+
+
+```{r}
+resultsTidy %<>%
+  mutate(across(starts_with(
+    "AnVILTrainingWorkshops"), as.character)) %>%
+  unnest(starts_with("AnVILTrainingWorkshops"), keep_empty = TRUE) %>%
+  mutate(across(
+    starts_with("AnVILTrainingWorkshops"),
+    ~ recode(
+      .x,
+      "1 (Most preferred in this list)" = "1",
+      "5 (Least preferred in this list)" = "5",
+      "NULL" = NA_character_
+    )
+  )) %>%
+  mutate(across(starts_with("AnVILTrainingWorkshops"), as.integer))
+```
+
+
+## Simplified experience status for various research categories (clinical, human genomics, non-human genomics)
+
+We want to add three columns that act as flags reporting whether the respondent is
+
+* experienced with clinical research, specifically moderately or extremely experienced in working with human clinical data
+* experienced with human genomics research, specifically moderately or extremely experienced in working with human genomics data
+* experienced with non-human genomics research, specifically moderately or extremely experienced in working with non-human genomics data
+
+We will use this information later to subset responses when considering popular tools or datasets.
+
Question and possible answers + +>How much experience do you have analyzing the following data categories? + +The three research categories people are asked about include + +* Human Genomic +* Non-human Genomic +* Human Clinical + +Possible answers include + +* Not at all experienced +* Slightly experienced +* Somewhat experienced +* Moderately experienced +* Extremely experienced. + +
+ +
Setting research category experience flag details + +
Description of variable definitions and steps + +We use a `mutate` together with 3 `case_when`'s. + +* If the `HumanClinicalExperience` column response is "Moderately experienced" or "Extremely experienced", we mark that respondent as a human clinical research expert in the `clinicalFlag` column (`TRUE`). Otherwise, we mark a `FALSE` to signify they are not a clinical research expert. +* If the `HumanGenomicExperience` column response is "Moderately experienced" or "Extremely experienced", we mark that respondent as a human genomic research expert in the `humanGenomicFlag` column (`TRUE`). Otherwise, we again mark a `FALSE` to signify not an expert. +* If the `NonHumanGenomicExperience` column response is "Moderately experienced" or "Extremely experienced", we mark that respondent as a non-human genomic research expert in the `nonHumanGenomicFlag` column (`TRUE`). Otherwise, we again mark a `FALSE` to signify not an expert. + +
+ +```{r} +resultsTidy %<>% + mutate( + clinicalFlag = case_when( + HumanClinicalExperience == "Moderately experienced" | HumanClinicalExperience == "Extremely experienced" ~ TRUE, + .default = FALSE + ), + humanGenomicFlag = case_when( + HumanGenomicExperience == "Moderately experienced" | HumanGenomicExperience == "Extremely experienced" ~ TRUE, + .default = FALSE + ), + nonHumanGenomicFlag = case_when(NonHumanGenomicExperience == "Moderately experienced" | NonHumanGenomicExperience == "Extremely experienced" ~ TRUE, + .default = FALSE) + ) +``` + +
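+A hedged, illustrative tally (not in the original analysis): since these flags define the subsets used later when looking at popular controlled access datasets, it can help to know how many respondents each flag captures. Note the flags are not mutually exclusive, so the counts can overlap.
+
+```{r}
+resultsTidy %>%
+  summarise(
+    nClinical = sum(clinicalFlag),
+    nHumanGenomic = sum(humanGenomicFlag),
+    nNonHumanGenomic = sum(nonHumanGenomicFlag)
+  )
+```
+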
+
+# Insights
+
+## Identify type of user
+
+**Takeaway:** Of the `r nrow(resultsTidy)` responses, `r nrow(resultsTidy %>% filter(UserType == "CurrentUser"))` were current users and `r nrow(resultsTidy %>% filter(UserType == "PotentialUser"))` were potential users. The majority of current users use the AnVIL for ongoing projects, while potential users were roughly evenly split between those who have never used the AnVIL (but have heard of it) and those who previously used the AnVIL but don't currently.
+
+**Potential Follow-ups:**
+
+- Look to see if the potential users who previously used the AnVIL show similar overall trends to the rest of the potential users
+- Directly ask why they no longer use the AnVIL (Elizabeth mentioned the possibility that the AnVIL is sometimes used in courses or workshops and students may not use it after that)
+
+### Prepare and plot the data
+
Description of variable definitions and steps + +First, we group the data by the assigned UserType labels/categories and their related more detailed descriptions. Then we use `summarize` to count the occurrences for each of those categories. We use a mutate statement to better fit the detailed descriptions on the plot. We then send this data to ggplot with the count on the x-axis, and the usage descriptions on the y-axis (ordered by count so highest count is on the top). We fill with the `UserType` description we've assigned. We manually scale the fill to be AnVIL colors and specify we want this to be a stacked bar chart. We then make edits for the theme and labels and finally add a geom_text label for the count next to the bars before we save the plot. + +
+
+```{r, message=FALSE, echo=FALSE}
+resultsTidy %>%
+  group_by(UserType, CurrentUsageDescription) %>%
+  summarize(count = n()) %>%
+  mutate(CurrentUsageDescription = case_when(
+    CurrentUsageDescription == "For ongoing projects (e.g., consistent project development and/or work)" ~ "For ongoing projects:\nconsistent project development\nand/or work",
+    CurrentUsageDescription == "For completed/long-term projects (e.g., occasional updates/maintenance as needed)" ~ "For completed/long-term projects:\noccasional updates/maintenance\nas needed",
+    CurrentUsageDescription == "For short-term projects (e.g., short, intense bursts separated by a few months)" ~ "For short-term projects:\nshort, intense bursts\nseparated by a few months",
+    CurrentUsageDescription == "I do not currently use the AnVIL, but have in the past" ~ "I do not currently use the AnVIL,\nbut have in the past",
+    CurrentUsageDescription == "I have never used the AnVIL, but have heard of it" ~ "I have never\nused the AnVIL",
+    CurrentUsageDescription == "I have never heard of the AnVIL" ~ "I have never\nheard of the AnVIL"
+  )) %>%
+  ggplot(aes(x = count, y = reorder(CurrentUsageDescription, count), fill = UserType)) +
+  scale_fill_manual(values = c("#E0DD10", "#035C94")) +
+  geom_bar(stat = "identity", position = "stack") +
+  theme_classic() +
+  xlab("Count") +
+  ylab("Current Usage Description") +
+  ggtitle("How would you describe your current usage\nof the AnVIL platform?") +
+  geom_text(aes(label = count, group = CurrentUsageDescription),
+            hjust = -0.5, size = 2)
+
+ggsave(here("plots/respondent_usagedescription.png"))
+```
+
+
+## Demographics: Institutional Affiliation
+
+**Takeaway:**
+
+### Prepare and plot the data
+
+```{r, message=FALSE, echo = FALSE}
+resultsTidy %>%
+  mutate(FurtherSimplifiedInstitutionalType = factor(FurtherSimplifiedInstitutionalType, levels = c("Industry & Other", "Education Focused", "Research Intensive"))) %>%
+  group_by(UserType, FurtherSimplifiedInstitutionalType) %>% summarize(InstitutionalCount = n()) %>%
+  ggplot(aes(
+    y = FurtherSimplifiedInstitutionalType,
+    x = InstitutionalCount,
+    fill = UserType
+  )) + geom_bar(position = "stack", stat = "identity") +
+  theme_classic() +
+  geom_text(
+    aes(label = after_stat(x), group = FurtherSimplifiedInstitutionalType),
+    stat = 'summary', fun = sum, hjust = -1, size = 2
+  ) +
+  ylab("") +
+  xlab("Count") +
+  annotation_custom(textGrob("- R1 University \n- Med Campus \n- Research Center\n- NIH ", gp = gpar(fontsize = 8)), xmin = -8.5, xmax = -8.5, ymin = 2.65, ymax = 2.65) +
+  annotation_custom(textGrob("- Industry \n- International Loc\n- Unknown ", gp = gpar(fontsize = 8)), xmin = -8.5, xmax = -8.5, ymin = .7, ymax = .7) +
+  annotation_custom(textGrob("- R2 University \n- Community College", gp = gpar(fontsize = 8)), xmin = -8.5, xmax = -8.5, ymin = 1.75, ymax = 1.75) +
+  coord_cartesian(clip = "off") +
+  scale_fill_manual(values = c("#E0DD10", "#035C94")) +
+  ggtitle("What institution are you affiliated with?")
+ggsave(here("plots/institutionalType_simplified_allResponses_colorUserType.png"))
+```
+
+## Demographics: Highest Degree Attained
+
+**Takeaway:**
+
+### Prepare and plot the data
+
Description of variable definitions and steps
+
+First we use `group_by()` on `FurtherSimplifiedDegrees` and `UserType` in conjunction with `summarize(n = n())` to count how many of each combination are observed in the data.
+
+Then we send this data to ggplot and make a bar chart with the y-axis showing the degrees, `reorder`ed by the summed count so that higher counts come first (otherwise the 2 MDs would be located after the high school and master's in progress bars, which have 1 each). The x-axis represents the count, and the fill is used to specify user type (current or potential AnVIL users). We use a stacked bar chart and include a label next to each bar giving the total sum for that degree type.
+
+We used [this Stack Overflow post to label sums above the bars](https://stackoverflow.com/questions/30656846/draw-the-sum-value-above-the-stacked-bar-in-ggplot2) and [this Stack Overflow post to remove NA from the legend](https://stackoverflow.com/questions/45493163/ggplot-remove-na-factor-level-in-legend).
+
+The rest of the changes are related to theme and labels and making sure that the numerical bar labels aren't cut off.
+
+ +```{r, message=FALSE, echo=FALSE} + +resultsTidy %>% + group_by(FurtherSimplifiedDegrees, UserType) %>% + summarize(n = n()) %>% + ggplot(aes(y = reorder(FurtherSimplifiedDegrees, n, sum), + x = n, + fill = UserType + )) + + geom_bar(position = "stack", stat="identity") + + geom_text( + aes(label = after_stat(x), group = FurtherSimplifiedDegrees), + stat = 'summary', fun = sum, hjust = -1, size=2 + ) + + theme_classic() + + ylab("Degree") + + xlab("Count") + + coord_cartesian(clip = "off") + + scale_fill_manual(values = c("#E0DD10", "#035C94"), na.translate = F) + + ggtitle("What is the highest degree you have attained?") + +ggsave(here("plots/degree_furthersimplified_usertype.png")) +``` + +## Experience: Genomics and Clinical Research Experience + +**Takeaway:** + +### Prepare and plot the data + +
Description of variable definitions and steps for preparing the data + +Here we select the columns containing answers for each data category: `HumanGenomicExperience`, `HumanClinicalExperience`, and `NonHumanGenomicExperience`. We also select `UserType` in case we want to split user type out at all in viewing the data. We use a `pivot_longer` to make a long dataframe that can be grouped and groups counted. The category/column names go to a new column, `researchType` and the values in those columns go to a new column `experienceLevel`. Before we use group by and count, we set the factor level on the new `experienceLevel` column to match the progression from not at all experienced to extremely experienced, and we rename the research categories so that the words have spaces, and we say research instead of experience. Then we use `group_by` and `summarize` to add counts for each combination of research category, experience level, and `UserType`. These counts are in the new `n` column. + +
+ +```{r, message=FALSE, echo=FALSE} +experienceDf <- resultsTidy %>% select(HumanGenomicExperience, HumanClinicalExperience, NonHumanGenomicExperience, UserType) %>% + pivot_longer(c(HumanGenomicExperience, HumanClinicalExperience, NonHumanGenomicExperience), names_to = "researchType", values_to = "experienceLevel") %>% + mutate(experienceLevel = + factor(experienceLevel, levels = c("Not at all experienced", "Slightly experienced", "Somewhat experienced", "Moderately experienced", "Extremely experienced")), + researchType = case_when(researchType == "HumanClinicalExperience" ~ "Human Clinical Research", + researchType == "HumanGenomicExperience" ~ "Human Genomic Research", + researchType == "NonHumanGenomicExperience" ~ "Non-human\nGenomic Research")) %>% + group_by(researchType, experienceLevel, UserType) %>% summarize(n = n()) +``` + +
Description of variable definitions and steps for plotting the bar graph + +We didn't observe big differences between current and potential users, so we believe this grouped plot is useful for understanding the community as a whole. + +This bar plot has the experience level on the x-axis, the count on the y-axis, and fills the bars according to the experience level (though the fill/color legend is turned off by setting legend.position to none). We facet the research category type and label the bars. We keep a summary stat and sum function and after_stat(y) for the label since the data has splits like `UserType` that we're not visualizing here. + +We adjust various aspects of the theme like turning off the grid and background and rotating the x-tick labels and changing the x- and y-axis labels. We also slightly widen the left axis so that the tick labels aren't cut off. + +
+ +```{r, message=FALSE, echo = FALSE} +ggplot(experienceDf, aes(x=experienceLevel,y=n, fill = experienceLevel)) + + facet_grid(~researchType) + + geom_bar(stat="identity") + + theme_bw() + + theme(panel.background = element_blank(), panel.grid = element_blank()) + + theme(axis.text.x = element_text(angle = 45, hjust=1)) + + geom_text( + aes(label = after_stat(y), group = experienceLevel), + stat = 'summary', fun = sum, vjust = -0.5, size=2 +) + + ylab("Count") + xlab ("Reported Experience Level") + + coord_cartesian(clip = "off") + + theme(plot.margin = margin(1,1,1,1.05, "cm")) + + scale_fill_manual(values = c("#035C94","#035385","#024A77","#024168", "#02395B")) + + theme(legend.position = "none")+ + ggtitle("How much experience do you have analyzing the following data categories?") + + +ggsave(here("plots/researchExperienceLevel_sequentialColor_noUserTypeSplit.png")) +``` + +## Experience: Controlled Access Datasets + +**Takeaway:** + +
Question and possible answers
+
+>What large, controlled access datasets do you access or would you be interested in accessing using the AnVIL?
+
+* All of Us*
+* Centers for Common Disease Genomics (CCDG)
+* The Centers for Mendelian Genomics (CMG)
+* Clinical Sequencing Evidence-Generating Research (CSER)
+* Electronic Medical Records and Genomics (eMERGE)
+* Gabriella Miller Kids First (GMKF)
+* Genomics Research to Elucidate the Genetics of Rare Diseases (GREGoR)
+* The Genotype-Tissue Expression Project (GTEx)
+* The Human Pangenome Reference Consortium (HPRC)
+* Population Architecture Using Genomics and Epidemiology (PAGE)
+* Undiagnosed Disease Network (UDN)
+* UK Biobank*
+* None
+* Other (Free Text Response)
+
+Since this is a select-all-that-apply question, we expect that there will be multiple, comma-separated responses. The free text responses will likely need to be recoded as well. The responses are in the `AccessWhichControlledData` column.
+
+ +### Prepare and plot the data + +
Description of variable definitions and steps for preparing the data
+
+We make a function `prep_df_whichData()` since we'll be using this workflow a few times for different subsets of the data: we want to be able to differentially display the data based on the experience status (experienced with clinical research, human genomics research, etc.) of the person saying they'd like access to the data.
+
+We want to color the bars based on whether or not the controlled access dataset is currently available on the AnVIL. We create a dataframe `onAnVILDF` to report this. We used the [AnVIL dataset catalog/browser](https://explore.anvilproject.org/datasets) to find out this information. However, HPRC and GREGoR don't show up in that resource, but both are available per these sources: [Announcement for HPRC](https://anvilproject.org/news/2021/03/11/hprc-on-anvil), [Access for HPRC](https://anvilproject.org/data/consortia/HPRC), [Access for GREGoR](https://anvilproject.org/data/consortia/GREGoR). Both GMKF and TCGA are data hosted on other NCPI platforms that are accessible via the AnVIL because of interoperability (see: https://www.ncpi-acc.org/ and https://ncpi-data.org/platforms). We list these as non-AnVIL hosted since, while accessible, they are not hosted on the AnVIL and would be inaccessible without NCPI. Finally, UDN is described as non-AnVIL hosted as it is in the data submission pipeline and not yet available.
+
+We'll join this AnVIL-hosted-or-not annotation with the actual data at the end.
+
+Given the input `subset_df`, we expect several answers to be comma separated. Since there are 12 set possible responses (not including "None") and one possible free response answer, we separate the `AccessWhichControlledData` column into 13 columns ("WhichA" through "WhichN"), separating on ", " (a comma followed by a space; otherwise there were duplicates where the only difference was a leading space). Alternative approaches should [consider using `str_trim`](https://stringr.tidyverse.org/reference/str_trim.html); a short sketch of that alternative follows the next code chunk. We set fill to "right", but this shouldn't really matter; it's just to suppress the unnecessary warning about adding NAs when there aren't 13 responses. If there's only one response, it'll put that response in `WhichA` and fill the rest of the columns with `NA`. If there are two responses, it'll put those two responses in `WhichA` and `WhichB` and fill the rest with `NA`, and so on.
+
+We then use `pivot_longer` to grab these columns we just made, putting the column names in a new column `WhichChoice` and the values in each column in a new column `whichControlledAccess`. We drop all the NAs in this new `whichControlledAccess` column (and there are a lot of them).
+
+Then we group by the new `whichControlledAccess` column and summarize a count for how many there are of each response.
+
+Then we pass this to a mutate and recode function to simplify the fixed responses to be just their acronyms, to remove asterisks (which let the survey respondent know that that dataset wasn't available because of policy restrictions), and to recode the free text responses (details below in "Additional notes on free text response recoding").
+
+We use a `left_join()` to join the cleaned data with the dataframe that specifies whether each dataset is currently available on the AnVIL or not. It's a left join rather than a full join so it only adds the annotation for datasets that appear in the results.
+
+Finally, we return this subset and cleaned dataframe so that it can be plotted.
+
+ +
Additional notes on free text response recoding + +There were 4 "Other" free response responses + +* "Being able to pull other dbGap data as needed." + --> We recoded this to be an "Other" +* "GnomAD and ClinVar" + --> GnomAD and ClinVar are not controlled access datasets so we recoded that response to be "None" +* "Cancer omics datasets" + --> We recoded this to be an "Other" +* "TCGA" + --> This response was left as is since there is a controlled access tier. + +
+ +```{r, message = FALSE, echo = FALSE} +onAnVILDF <- read_delim(here("data/controlledAccessData_codebook.txt"), delim = "\t", col_select = c(whichControlledAccess, AnVIL_Availability)) + +prep_df_whichData <- function(subset_df, onAnVILDF = onAnVILDF){ + subset_df %<>% separate(AccessWhichControlledData, c("WhichA", "WhichB", "WhichC", "WhichD", "WhichE", "WhichF", "WhichG", "WhichH", "WhichI", "WhichJ", "WhichK", "WhichM", "WhichN"), sep = ", ", fill="right") %>% + pivot_longer(starts_with("Which"), names_to = "WhichChoice", values_to = "whichControlledAccess") %>% + drop_na(whichControlledAccess) %>% + group_by(whichControlledAccess) %>% summarize(count = n()) %>% + mutate(whichControlledAccess = + recode(whichControlledAccess, + "All of Us*" = "All of Us", + "UK Biobank*" = "UK Biobank", + "Centers for Common Disease Genomics (CCDG)" = "CCDG", + "The Centers for Mendelian Genomics (CMG)" = "CMG", + "Clinical Sequencing Evidence-Generating Research (CSER)" = "CSER", + "Electronic Medical Records and Genomics (eMERGE)" = "eMERGE", + "Gabriella Miller Kids First (GMKF)" = "GMKF", + "Genomics Research to Elucidate the Genetics of Rare Diseases (GREGoR)" = "GREGoR", + "The Genotype-Tissue Expression Project (GTEx)" = "GTEx", + "The Human Pangenome Reference Consortium (HPRC)" = "HPRC", + "Population Architecture Using Genomics and Epidemiology (PAGE)" = "PAGE", + "Undiagnosed Disease Network (UDN)" = "UDN", + "Being able to pull other dbGap data as needed." = "Other", + "Cancer omics datasets" = "Other", + "GnomAD and ClinVar" = "None", #not controlled access + ) + ) %>% left_join(onAnVILDF, by="whichControlledAccess") + + return(subset_df) +} + +``` +
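+As mentioned above, an alternative to splitting on ", " would be to split on a bare comma and then trim whitespace with `str_trim()`. A hedged sketch of that alternative follows (illustrative only; the analysis itself uses the `separate()` approach inside `prep_df_whichData()`). `separate_rows()` is from tidyr and `str_trim()` from stringr, both loaded with the tidyverse.
+
+```{r}
+# Split each multi-select response into one row per dataset, then trim any
+# leading/trailing whitespace so entries differing only by a leading space collapse together.
+resultsTidy %>%
+  separate_rows(AccessWhichControlledData, sep = ",") %>%
+  mutate(AccessWhichControlledData = str_trim(AccessWhichControlledData)) %>%
+  count(AccessWhichControlledData, sort = TRUE)
+```
+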
Description of variable definitions and steps for preparing the data continued + +Here we set up 4 data frames for plotting + +* The first uses all of the responses and sends them through the `prep_df_whichData()` function to clean the data for plotting to see which controlled access datasets are the most popular. +* The second filters to grab just the responses from those experienced in clinical research using the `clinicalFlag` column (described earlier in the Clean Data -> Simplified experience status for various research categories (clinical, human genomics, non-human genomics) subsection) +* The third filters to grab just the responses from those experienced in human genomic research using the `humanGenomicFlag` column (described earlier in the Clean Data -> Simplified experience status for various research categories (clinical, human genomics, non-human genomics) subsection) +* The fourth filters to grab just the responses from those experienced in non-human genomic research using the `nonHumanGenomicFlag` column (described earlier in the Clean Data -> Simplified experience status for various research categories (clinical, human genomics, non-human genomics) subsection) + +
+ +```{r, message=FALSE, echo = FALSE} +whichDataDf <- resultsTidy %>% prep_df_whichData(onAnVILDF = onAnVILDF) + +whichDataClinicalSubset <- resultsTidy %>% + filter(clinicalFlag == TRUE) %>% + prep_df_whichData(onAnVILDF = onAnVILDF) + +whichDataHumanGenomicSubset <- resultsTidy %>% + filter(humanGenomicFlag == TRUE) %>% + prep_df_whichData(onAnVILDF = onAnVILDF) + +whichDataNonHumanGenomicSubset <- resultsTidy %>% + filter(nonHumanGenomicFlag == TRUE) %>% + prep_df_whichData(onAnVILDF = onAnVILDF) + +``` + +
Description of variable definitions and steps for plotting the bar graphs
+
+We also use a function here because the plotting steps are the same each time; only the subtitle and the input dataframe change.
+
+The function takes the input dataframe and makes a bar plot with the controlled access datasets on the x-axis (reordered by count so the most popular is on the left), the count/popularity on the y-axis, and the fill based on whether the dataset is available on the AnVIL or not.
+
+We change theme elements like removing panel borders, panel background, and panel grid, and rotate the x-axis tick labels. We add x- and y-axis labels and a title (and a subtitle if specified, which it will be when we're looking at just a subset, like those who are experienced with clinical data).
+
+We also add text labels above the bars to say how many times each dataset was marked/requested. Note that we again have to use the `after_stat`/`stat = 'summary'`/`fun = sum` approach for these labels: because some responses were recoded to the same value after we counted with `group_by` and `summarize`, an accurate label has to sum across every row that ended up with the same dataset name. The function uses `coord_cartesian(clip = "off")` so these bar text labels aren't cut off, and finally returns the plot.
+
+We call this function 4 times
+
+* once for all the data (with no subtitle)
+* next for just those experienced with clinical data (using a subtitle to specify this)
+* next for just those experienced with human genomic data (using a subtitle to specify this)
+* and finally for just those experienced with non-human genomic data (using a subtitle to specify this)
+
+ +```{r, message=FALSE, echo=FALSE} + +plot_which_data <- function(inputToPlotDF, subtitle = NULL){ + + toreturnplot <- ggplot(inputToPlotDF, aes(x = reorder(whichControlledAccess, -count), y = count, fill = AnVIL_Availability)) + + geom_bar(stat="identity") + + theme_classic() + theme(panel.background = element_blank(), panel.grid = element_blank()) + + theme(axis.text.x = element_text(angle=45, hjust=1)) + + xlab("Controlled access datasets") + ylab("Count") + + ggtitle("What large, controlled access datasets do you access\nor would you be interested in accessing using the AnVIL?", subtitle = subtitle) + + geom_text(aes(label = after_stat(y), group = whichControlledAccess), + stat = 'summary', fun = sum, vjust = -1, size=2) + + coord_cartesian(clip = "off") + + scale_fill_manual(values = c("#25445A", "#7EBAC0", "grey")) + + theme(legend.position = c(0.8, 0.8)) + +return(toreturnplot) + +} + +``` + + +```{r, message=FALSE, echo = FALSE} +everyoneDataPlot <- plot_which_data(whichDataDf) + +everyoneDataPlot + +ggsave(here("plots/whichcontrolleddata.png"), plot = everyoneDataPlot) +``` + +```{r, message=FALSE, echo=FALSE} +clinicalDataPlot <- plot_which_data(whichDataClinicalSubset, subtitle = "Respondents moderately or extremely experienced with clinical data") + +clinicalDataPlot + +ggsave(here("plots/whichcontrolleddata_clinical.png"), plot = clinicalDataPlot) +``` + +```{r, message=FALSE, echo=FALSE} +humanGenomicDataPlot <- plot_which_data(whichDataHumanGenomicSubset, subtitle = "Respondents moderately or extremely experienced with human genomic data") + +humanGenomicDataPlot + +ggsave(here("plots/whichcontrolleddata_humangenomic.png"), plot = humanGenomicDataPlot) +``` + +```{r, message=FALSE, echo=FALSE} +nonHumanGenomicDataPlot <- plot_which_data(whichDataNonHumanGenomicSubset, subtitle = "Respondents moderately or extremely experienced with non-human genomic data") + +nonHumanGenomicDataPlot + +ggsave(here("plots/whichcontrolleddata_nonhumangenomic.png"), plot = nonHumanGenomicDataPlot) +``` + +## Experience: Tool & Resource Knowledge/Comfort level + +**Takeaway:** + +### Prepare and plot the data + +
Description of variable definitions and steps for preparing the data
+
+We build a single dataframe of average comfort scores by user type. For current users we select all of the "Score_" columns; for potential users we select only the "Score_AllTech" columns (potential users weren't asked about technologies on the AnVIL). For each subset we sum the scores with `colSums`, divide by the number of respondents of that user type to get an average score, and then `separate` the original column name on "Tech" to recover whether the score refers to use on the AnVIL or separate from the AnVIL (`AnVILorNo`) and which tool it refers to (`Tool`), recoding the tool names to be more readable. The two dataframes are combined with `bind_rows` and the `UserType` factor levels are set.
+
+A second chunk then splits the combined "Bioconductor & RStudio" row into separate "Bioconductor" and "RStudio" rows (using `rows_append` and `rows_delete`) so each tool can be shown on its own line in the plot.
+
+ +```{r, message=FALSE, echo = FALSE} +toPlotToolKnowledge <- bind_rows( + resultsTidy %>% + filter(UserType == "CurrentUser") %>% + select(starts_with("Score_")) %>% + colSums(na.rm = TRUE) %>% + as.data.frame() %>% `colnames<-`(c("totalScore")) %>% + mutate(nscores = sum(resultsTidy$UserType == "CurrentUser"), + avgScore = totalScore / nscores, + UserType = "Current Users") %>% + mutate(WhereTool = rownames(.)) %>% + separate(WhereTool, c("AnVILorNo", "Tool"), sep = "Tech", remove = TRUE) %>% + mutate(AnVILorNo = + case_when(AnVILorNo == "Score_CurrentAnVIL" ~ "On the AnVIL", + AnVILorNo == "Score_All" ~ "Separate from the AnVIL" + ), + Tool = + recode(Tool, "JupyterNotebooks" = "Jupyter Notebooks", + "WDL" = "Workflows", + "CommandLine" = "Unix / Command Line", + "AccessData" = "Access controlled access data", + "Terra" = "Terra Workspaces", + "BioconductorRStudio" = "Bioconductor & RStudio" + ) + ), + resultsTidy %>% + filter(UserType == "PotentialUser") %>% + select(starts_with("Score_AllTech")) %>% + colSums() %>% + as.data.frame() %>% `colnames<-`(c("totalScore")) %>% + mutate(nscores = sum(resultsTidy$UserType == "PotentialUser"), + avgScore = totalScore / nscores, + UserType = "Potential Users") %>% + mutate(WhereTool = rownames(.)) %>% + separate(WhereTool, c("AnVILorNo", "Tool"), sep = "Tech", remove = TRUE) %>% + mutate(AnVILorNo = + case_when(AnVILorNo == "Score_CurrentAnVIL" ~ "On the AnVIL", + AnVILorNo == "Score_All" ~ "Separate from the AnVIL" + ), + Tool = + recode(Tool, "JupyterNotebooks" = "Jupyter Notebooks", + "WDL" = "Workflows", + "CommandLine" = "Unix / Command Line", + "AccessData" = "Access controlled access data", + "Terra" = "Terra Workspaces", + "BioconductorRStudio" = "Bioconductor & RStudio" + ) + ) +) %>% + mutate(UserType = factor(UserType, levels = c("Potential Users", "Current Users"))) +``` + + +```{r, message=FALSE, echo = FALSE} +roi <- toPlotToolKnowledge[which(toPlotToolKnowledge$Tool == "Bioconductor & RStudio"),] +toPlotToolKnowledge <- rows_append(toPlotToolKnowledge, data.frame( + UserType = rep(roi$UserType,2), + avgScore = rep(roi$avgScore,2), + AnVILorNo = rep(roi$AnVILorNo,2), + Tool = c("Bioconductor", "RStudio") + )) %>% + rows_delete(., data.frame(roi)) +``` + + +
Description of variable definitions and steps for plotting the dumbbell-like plot
+
+We used [this Stack Overflow response](https://stackoverflow.com/a/72309061) to get the values for `scale_shape_manual()`.
+
+ +```{r, message=FALSE, echo = FALSE} +# Provide a list of AnVIL only Tools +AnVIL_only <- + setdiff(toPlotToolKnowledge[toPlotToolKnowledge$UserType == "Current Users" & + toPlotToolKnowledge$AnVILorNo == "On the AnVIL", ]$Tool, + toPlotToolKnowledge[toPlotToolKnowledge$UserType == "Potential Users", ]$Tool) + +# Order dummy column based only on Potential users +toPlotToolKnowledge <- + toPlotToolKnowledge %>% mutate(ToolOrder = case_when( + UserType == "Potential Users" | Tool %in% AnVIL_only ~ avgScore, + TRUE ~ 0 + )) + +PlotToolKnowledge_avg_score <- + ggplot(toPlotToolKnowledge, aes(y = reorder(Tool, avgScore), x = avgScore)) + + geom_point(aes(color = UserType, shape = AnVILorNo)) + +PlotToolKnowledge_potential_user_score <- + ggplot(data = toPlotToolKnowledge) + + geom_point(data = toPlotToolKnowledge[toPlotToolKnowledge$UserType == "Potential Users" | toPlotToolKnowledge$Tool %in% AnVIL_only ,], + aes(color = UserType, shape = AnVILorNo, y = reorder(Tool, ToolOrder), x = avgScore)) + + geom_point(data = toPlotToolKnowledge[toPlotToolKnowledge$UserType == "Current Users",], + aes(color = UserType, shape = AnVILorNo, y = Tool, x = avgScore)) + +PlotToolKnowledge_customization <- function(gg) { + return( + gg + + scale_x_continuous(breaks = 0:5, labels = 0:5, limits = c(0,5)) + + ylab("Tool or Data Resource") + + xlab("Average Knowledge or Comfort Score") + + theme_bw() + + theme(panel.background = element_blank(), panel.grid.minor.x = element_blank()) + #facet_wrap(~UserType, nrow=3) + + annotation_custom(textGrob("Don't know\nat all", gp=gpar(fontsize=8, fontface = "bold")),xmin=0,xmax=0,ymin=-2,ymax=-2) + + annotation_custom(textGrob("Extremely\ncomfortable", gp=gpar(fontsize=8, fontface= "bold")),xmin=5,xmax=5,ymin=-2,ymax=-2) + + coord_cartesian(clip = "off") + + theme(plot.margin = margin(1,1,1,1.1, "cm")) + + ggtitle("How would you rate your knowledge of or\ncomfort with these technologies or data features?") + + scale_color_manual(values = c("#E0DD10", "#035C94")) + + scale_shape_manual(values = c(4, 16)) + ) +} + +PlotToolKnowledge_customization(PlotToolKnowledge_avg_score) +ggsave(here("plots/tooldataresourcecomfortscore_singlepanel.png"), w = 2200, h = 1350, units = "px") + +PlotToolKnowledge_customization(PlotToolKnowledge_potential_user_score) +ggsave(here("plots/tooldataresourcecomfortscore_singlepanel_by_potential_users.png"), w = 2200, h = 1350, units = "px") +``` + + +## Awareness: AnVIL Demos + +**Takeaway:** + +## Awareness: AnVIL Support Forum + +**Takeaway:** + +## Preferences: Feature importance for current vs potential users + +**Takeaway:** + +### Prepare and plot the data + +Average rank is total rank (sum of given ranks) divided by number of votes (number of given ranks) + +
Description of variable definitions and steps for preparing the data
+
+We make two different dataframes that find the total ranks (column name: `totalRank`) and average ranks (column name: `avgRank`) for each feature and then row bind (`bind_rows`) these two dataframes together to make `totalRanksdf`. The reason that we make the two separately is that one is for Potential users (`starts_with("PotentialRank")`) and one is for Current users (`starts_with("CurrentRank")`). They have a different number of votes `nranks`, so it made more sense to work with them separately, following the same steps, and then row bind them together.
+
+The individual steps for each of these dataframes are to
+
+* `select` the relevant columns from `resultsTidy`
+* perform sums with `colSums`, adding together the ranks in those columns (each column corresponds to a queried feature); we set `na.rm = TRUE` to ignore the NAs (since not every survey respondent was asked each question; e.g., if they were a current user they weren't asked as a potential user)
+* send those sums to a data frame such that the selected column names from the first step are now the row names and the total summed rank is the only column, with each row corresponding to a queried feature
+* use a `mutate` to
+  * add a new column `nranks` that holds the number of responses in the survey that are from potential users (i.e., the number that would have assigned ranks to the PotentialRank questions) or the number of responses that are from current/returning users (i.e., the number that would have assigned ranks to the CurrentRank questions)
+  * add a new column `avgRank` that divides the `totalRank` by `nranks`
+
+After these two dataframes are bound together (`bind_rows`), the rest of the steps are for aesthetics in plotting and making sure ggplot knows the `UserType` and the feature of interest:
+
+* We move the rownames to their own column `UsertypeFeature` (with `mutate(UsertypeFeature = rownames(.))`).
+* We separate the values in that column on the word "Rank", removing the `UsertypeFeature` column we just made and making two new columns (`Usertype` and `Feature`), where `Usertype` is either "Current" or "Potential" and `Feature` is the abbreviated feature name.
+* We then use a `case_when` within a `mutate()` to fill out those features so they're more informative and show the choices survey respondents were given.
+
+ +```{r, message=FALSE, echo = FALSE} +totalRanksdf <- + bind_rows( + resultsTidy %>% + select(starts_with("PotentialRank")) %>% + colSums(na.rm = TRUE) %>% + as.data.frame() %>% `colnames<-`(c("totalRank")) %>% + mutate(nranks = sum(resultsTidy$UserType == "PotentialUser"), + avgRank = totalRank / nranks), + resultsTidy %>% + select(starts_with("CurrentRank")) %>% + colSums(na.rm = TRUE) %>% + as.data.frame() %>% `colnames<-`(c("totalRank")) %>% + mutate(nranks = sum(resultsTidy$UserType == "CurrentUser"), + avgRank = totalRank /nranks) + ) %>% + mutate(UsertypeFeature = rownames(.)) %>% + separate(UsertypeFeature, c("Usertype", "Feature"), sep = "Rank", remove = TRUE) %>% + mutate(Feature = + case_when(Feature == "EasyBillingSetup" ~ "Easy billing setup", + Feature == "FlatRateBilling" ~ "Flat-rate billing rather than use-based", + Feature == "FreeVersion" ~ "Free version with limited compute or storage", + Feature == "SupportDocs" ~ "On demand support and documentation", + Feature == "ToolsData" ~ "Specific tools or datasets are available/supported", + Feature == "CommunityAdoption" ~ "Greater adoption of the AnVIL by the scientific community"), + Usertype = factor(case_when(Usertype == "Potential" ~ "Potential Users", + Usertype == "Current" ~ "Current Users"), levels = c("Potential Users", "Current Users")) + ) +``` + +
Description of variable definitions and steps for plotting the dumbbell plot + +We use the `totalRanksdf` we just made. The x-axis is the `avgRank` values, and the y-axis displays the informative `Feature` values, however, we `reorder` the y-axis so that more important (lower number) avgRank features are displayed higher in the plot. + +geom_point and geom_line are used in conjunction to produce the dumbbell look of the plot and we set the color of the points to correspond to the `Usertype` + +Some theme things are changed, labels and titles added, setting the color to match AnVIL colors, and then we display and save that plot. + +The first version of the plot has trimmed limits, so the second version sets limits on the x-axis of 1 to 6 since those were the options survey respondents were given for ranking. It also adds annotations (using [Grobs, explained in this Stack Overflow post answer](https://stackoverflow.com/a/31081162)) to specify which rank was "Most important" and which was "Least important". + +Then we've also adjusted the left margin so that the annotation isn't cut off. + +We then display and save that version as well. + +Finally, we'll reverse the x-axis so that most important is on the right and least important is on the left. We use `scale_x_reverse()` for that. We have to change our group annotations so that they are now on the negative number version of `xmin` and `xmax` that we were using previously. We then display and save that version as well. + +
+ +```{r, message=FALSE, echo = FALSE} +gdumbbell <- ggplot(totalRanksdf, aes(x = avgRank, y = reorder(Feature, -avgRank))) + + geom_line() + + geom_point(aes(color = Usertype), size = 3) + + theme(panel.background = element_blank()) + theme_bw() + theme(legend.position = "bottom") + + xlab("Average Rank") + + ylab("Feature") + + ggtitle("Rank the following features\naccording to their importance to\nyou as a potential user or for\nyour continued use of the AnVIL") + + scale_color_manual(values = c("#E0DD10", "#035C94")) + + theme(legend.title = element_blank()) + +gdumbbell <- gdumbbell + + scale_x_continuous(breaks = 1:6, labels = 1:6, limits = c(1,6))+ + annotation_custom(textGrob("Most\nimportant", gp=gpar(fontsize=8, fontface = "bold")),xmin=1,xmax=1,ymin=-0.5,ymax=-0.5) + + annotation_custom(textGrob("Least\nimportant", gp=gpar(fontsize=8, fontface= "bold")),xmin=6,xmax=6,ymin=-0.5,ymax=-0.5) + + coord_cartesian(clip = "off") + + theme(plot.margin = margin(1,1,1,1.1, "cm")) + +ggsave(here("plots/dumbbellplot_xlim16_rankfeatures.png"), plot = gdumbbell) + +gdumbbell <- gdumbbell + + scale_x_reverse(limits = c(6,1), breaks = 6:1, labels = 6:1) + + annotation_custom(textGrob("Most\nimportant", gp=gpar(fontsize=8, fontface = "bold")),xmin=-1,xmax=-1,ymin=-0.5,ymax=-0.5) + + annotation_custom(textGrob("Least\nimportant", gp=gpar(fontsize=8, fontface= "bold")),xmin=-6,xmax=-6,ymin=-0.5,ymax=-0.5) + +gdumbbell + +ggsave(here("plots/dumbbellplot_xlim16_revaxis_rankfeatures.png"), plot = gdumbbell) + +``` + +## Preferences: Training Workshop Modality + +**Takeaway:** + +### Prepare and plot the data + +
Description of variable definitions and steps for preparing the data
+
+This follows the same approach as the feature importance ranks above, but split by user type: for current users and potential users separately, we select the columns that start with "AnVILTrainingWorkshops", sum the ranks with `colSums`, and divide by the number of respondents of that user type (`nranks`) to get an `avgRank`. We then move the rownames into a `TrainingType` column, strip the "AnVILTrainingWorkshops" prefix with `str_replace`, and recode the remaining abbreviations to more readable modality labels before binding the two dataframes together with `bind_rows` and setting the `UserType` factor levels.
+
+ +```{r, message=FALSE, echo = FALSE} +toPlotTrainingRanks <- bind_rows( + resultsTidy %>% + filter(UserType == "CurrentUser") %>% + select(starts_with("AnVILTrainingWorkshops")) %>% + colSums(na.rm = TRUE) %>% + as.data.frame() %>% `colnames<-`(c("totalRank")) %>% + mutate(nranks = sum(resultsTidy$UserType == "CurrentUser"), + avgRank = totalRank / nranks, + UserType = "Current Users") %>% + mutate(TrainingType = rownames(.)) %>% + mutate(TrainingType = str_replace(TrainingType, "AnVILTrainingWorkshops", "")), + resultsTidy %>% + filter(UserType == "PotentialUser") %>% + select(starts_with("AnVILTrainingWorkshops")) %>% + colSums() %>% + as.data.frame() %>% `colnames<-`(c("totalRank")) %>% + mutate(nranks = sum(resultsTidy$UserType == "PotentialUser"), + avgRank = totalRank / nranks, + UserType = "Potential Users") %>% + mutate(TrainingType = rownames(.)) %>% + mutate(TrainingType = str_replace(TrainingType, "AnVILTrainingWorkshops", "")) + ) %>% mutate(TrainingType = recode(TrainingType, "SpecEvent" = "AnVIL-specific event", "OnSite" = "On-site at my institution", "Conference" = "Conference (e.g., CSHL, AMIA)")) %>% + mutate(UserType = factor(UserType, levels = c("Potential Users", "Current Users"))) + +``` + +
Description of variable definitions and steps for plotting the dumbbell plot
+
+As with the feature importance dumbbell plot, the x-axis shows the average rank and the y-axis lists the training modalities (`reorder`ed so the most preferred modalities appear at the top). `geom_line` and `geom_point` together produce the dumbbell look, with point color indicating user type and the colors set to match AnVIL colors. We first save a version with the x-axis limited to 1 through 5 and annotated with "Most preferred" and "Least preferred", and then a second version with the x-axis reversed using `scale_x_reverse()` (with the annotations moved to the negative `xmin`/`xmax` positions), displaying and saving each.
+
+ +```{r, message=FALSE, echo = FALSE} +tdumbbell <- ggplot(toPlotTrainingRanks, aes(x = avgRank, y = reorder(TrainingType, -avgRank))) + + geom_line() + + geom_point(aes(color = UserType), size = 3) + + theme(panel.background = element_blank()) + theme_bw() + theme(legend.position = "bottom") + + xlab("Average Rank") + + ylab("Training Workshop Modality") + + ggtitle("Please rank how/where you would prefer to attend\nAnVIL training workshops.") + + scale_color_manual(values = c("#E0DD10", "#035C94")) + + theme(legend.title=element_blank()) + +tdumbbell <- tdumbbell + + scale_x_continuous(breaks = 5:1, labels = 5:1, limits = c(1,5))+ + annotation_custom(textGrob("Most\npreferred", gp=gpar(fontsize=8, fontface = "bold")),xmin=1,xmax=1,ymin=-0.5,ymax=-0.5) + + annotation_custom(textGrob("Least\npreferred", gp=gpar(fontsize=8, fontface= "bold")),xmin=5,xmax=5,ymin=-0.5,ymax=-0.5) + + coord_cartesian(clip = "off") + + theme(plot.margin = margin(1,1,1,1.1, "cm")) + +ggsave(here("plots/dumbbellplot_xlim15_trainingmodalitypref.png"), plot = tdumbbell) + +tdumbbell <- tdumbbell + + scale_x_reverse(limits = c(5,1)) + + annotation_custom(textGrob("Most\npreferred", gp=gpar(fontsize=8, fontface = "bold")),xmin=-1,xmax=-1,ymin=-0.5,ymax=-0.5) + + annotation_custom(textGrob("Least\npreferred", gp=gpar(fontsize=8, fontface= "bold")),xmin=-5,xmax=-5,ymin=-0.5,ymax=-0.5) + +tdumbbell + +ggsave(here("plots/dumbbellplot_xlim15_revaxis_trainingmodalitypref.png"), plot = tdumbbell) +``` + +## Returning User Specific: Likely to recommend? + +**Takeaway:** + +## Returning User Specific: Number of years of use + +**Takeaway:** + +## Returning User Specific: Foreseeable Computational Needs + +**Takeaway:** + +## Session Info and other analysis notes + +
Session Info + +```{r} +sessionInfo() +``` + +
\ No newline at end of file
diff --git a/anvilPoll2024MainAnalysis.html b/anvilPoll2024MainAnalysis.html
new file mode 100644
index 0000000..c2eed33
--- /dev/null
+++ b/anvilPoll2024MainAnalysis.html
@@ -0,0 +1,1695 @@
+ + + + + + + +
library(googlesheets4)
+library(tidyverse)
+library(magrittr) #for %<>%
+library(here)
+library(grid) #for Grobs and unit()
+
+

Read in data

+

Data were read in via a Google Sheet on the AnVIL Team Drive.

+
+ +Import details + +

The google sheet we are reading in is stored in an AnVIL Google drive +folder State of the AnVIL 2024. Its permissions are +restricted such that only people with access can open with the link. +Using gs4_auth() to authorize my google account before +running this code, I needed to change the scopes argument, +specifically scopes=spreadsheets.readonly was +necessary.

+

In this google sheet, each question is a column, and each response to +the survey is a row. If the respondant wasn’t asked or didn’t answer a +specific question, there is an NA in the corresponding row/column.

+
+
+
+

Clean data

+

Note: Every code block in this section edits the +resultsTidy data frame and should be run before plotting +within the # Insights section below. Subsections are marked +according to which Insight they are related to, but cleaning steps like +identifying the user type are important for most every plot.

+
+

Set Column Names

+

We set the column names to simplified column names (e.g., that help +us select related columns for various analyses) by reading in a codebook +(data/codebook.txt).

+
+ +Simplifying column names details + +
+ +Description of variable definitions and steps + +

We have a codebook that is a tab delmited file and has 4 columns, and +each row represents a question in the survey. The first column lists +a/the question from the survey (SurveyColNames); the second +column lists a corresponding simplified column name for that survey +question (SimplifedColNames); the third column describes +the variable format (VariableFormat), e.g, is it a double, +or a character; the fourth column gives a lengthier description of the +question (Description), e.g., who was asked it, what +possible answers are, etc.

+

This code block reads in that codebook and specifically selects the +SimplifiedColNames column. It then renames the column names +of the raw results from the google sheet (where each question is a +column) with these simplified column names.

+
+
simplifiedColNames <-
+  read_delim(here("data/codebook.txt"),
+             delim = "\t",
+             col_select = SimplifiedColNames)
+resultsTidy <-
+  resultsRaw %>% `colnames<-`(unlist(simplifiedColNames))
+
+
+
+

Keep last response if duplicated according to email (if email +provided)

+

Choosing to select the last response because the respondent may have +spent more time thinking about how they wanted to respond after their +initial response.

+
+ +Filtering duplicated responses details + +
+ +Description of variable definitions and steps + +
    +
  • The table function tabulates the number of occurrences, +and we tell it to ignore literal NAs. Because providing an email was +optional, we expect many NA responses. The table function, +by ignoring NAs, will return the unique emails and the number of times +each email was used. We store the tabulated results in the variable +tabulatedEmails
  • +
  • Using the sum function, we look to see how many +emails/responses are provided more than once. +tabulatedEmails > 1 is returning a vector of TRUEs and +FALSEs where TRUE means that there was more than one instance/count of a +given email and FALSE means there wasn’t. The sum function +in essence counts the number of TRUEs and if the sum is +greater than 0, that means there is at least one duplicated email whose +count is greater than 1.
  • +
  • duplicatedEmails reports which emails are duplicated by +using the tabulated/table of emails. First it identifies which emails +were observed more than once, using the which function, and +uses the indices returned from that to index the names of +the tabulated emails, grabbing the specific emails.
  • +
  • We want to know which entries from the overall survey responses to remove for each duplicated email. Ideally, we want to remove the responses all at the same time or go backwards removing one at a time, because we don't want to affect downstream indices. The approach here keeps track of all the indices of interest and removes them at the same time.
      +
    • Therefore, we’ll use lapply to loop through the +duplicated emails (duplicatedEmails) and grab the index for +survey responses associated with that email address +(which(resultsTidy$Email == duplicatedEmails[x])).
    • +
    • However, we want to keep the last survey response for each duplicated email. Therefore, we wrap that which call in head() with a -1 second argument (head(which(...), -1)) so that it grabs all indices except the last one.
    • +
    • Finally, we unlist the indices so that there's a single vector, IDXs_to_remove, containing the indices of any duplicated email responses to be removed. And since we want to remove them all at the same time, we subset resultsTidy, grabbing every row except those in IDXs_to_remove, as denoted by the -.
    • +
  • +
+
+
tabulatedEmails <- table(resultsTidy$Email, useNA = "no")
+
+if (sum(tabulatedEmails > 1) > 0) {
+  duplicatedEmails <-
+    names(tabulatedEmails)[which(tabulatedEmails > 1)]
+  IDXs_to_remove <-
+    unlist(lapply(1:length(duplicatedEmails), function(x)
+      head(
+        which(resultsTidy$Email == duplicatedEmails[x]),-1
+      )))
+  resultsTidy <- resultsTidy[-IDXs_to_remove, ]
+}
+
+nrow(resultsTidy)
+
## [1] 50
+
+
+
+

Identify type of user

+

The first question of the poll asks respondents to describe their +current usage of the AnVIL and allows us to categorize respondents as +potential or current users of the AnVIL.

+
+ +Question and possible answers + +
+

How would you describe your current usage of the AnVIL platform?

+
+

Possible answers include:

+
    +
  • For completed/long-term projects (e.g., occasional +updates/maintenance as needed)
  • +
  • For ongoing projects (e.g., consistent project development and/or +work)
  • +
  • For short-term projects (e.g., short, intense bursts separated by a +few months)
  • +
  • I do not currently use the AnVIL, but have in the past
  • +
  • I have never heard of the AnVIL
  • +
  • I have never used the AnVIL, but have heard of it.
  • +
+

The first three possible answers represent current or returning AnVIL +users. The last three possible answers represent potential AnVIL +users.

+
+
+ +Identifying user type details + +
+ +Description of variable definitions and steps + +

We use case_when to evaluate the response in the CurrentUsageDescription column and assign a corresponding, simplified label of "CurrentUser" or "PotentialUser". In other words, we translate the given response to a user label. Using case_when as the nested function inside mutate means that the translation is saved in a new column, UserType.

+
+
resultsTidy %<>%
+  mutate(
+    UserType = case_when(
+      CurrentUsageDescription == "For ongoing projects (e.g., consistent project development and/or work)" ~ "CurrentUser",
+      CurrentUsageDescription == "For completed/long-term projects (e.g., occasional updates/maintenance as needed)" ~ "CurrentUser",
+      CurrentUsageDescription == "For short-term projects (e.g., short, intense bursts separated by a few months)" ~ "CurrentUser",
+      CurrentUsageDescription == "I do not currently use the AnVIL, but have in the past" ~ "PotentialUser",
+      CurrentUsageDescription == "I have never used the AnVIL, but have heard of it" ~ "PotentialUser",
+      CurrentUsageDescription == "I have never heard of the AnVIL" ~ "PotentialUser"
+    )
+  ) %>% 
+  mutate(UserType = factor(UserType, levels = c("PotentialUser", "CurrentUser")))
+
+
+
+

Institutional Affiliation: Synchronize Institution Names

+

Users were able to disclose their institutional affiliation using a free text response; therefore, we needed to synchronize institution names (example: Johns Hopkins and Johns Hopkins University refer to the same institution, despite the difference in the free responses) and add simplified affiliation categories ([R1 University, R2 University, Community College, Medical Center or School, International Location, Research Center, NIH, Industry, Unknown] and [Research Intensive, Education Focused, and Industry & Other]). The first level of affiliation categories is notated in an institution-specific codebook (data/institution_codebook.txt).

+
+ +Question and possible answers + +
+

What institution are you affiliated with?

+
+

Free response for answers

+
+
+ +Institutional affiliation synchronization details + +

This synchronization corrects for the various spellings and capitalizations used for the same institution (e.g., Johns Hopkins and Johns Hopkins University refer to the same institution, despite the difference in the free responses).

+
+ +Description of variable definitions and steps + +

We use a recode() within a mutate() to synchronize the institutional affiliations as necessary.

+
+
resultsTidy %<>%
+  mutate(
+    InstitutionalAffiliation =
+      recode(
+        InstitutionalAffiliation,
+        "Broad" = "Broad Institute",
+        "broad institute" = "Broad Institute",
+        "CUNY School of Public Health; Roswell Park Comprehensive Cancer Center" = "City University of New York",
+        "harvard" = "Harvard University",
+        "Harvard Public Health" = "Harvard University",
+        "Johns hopkins" = "Johns Hopkins",
+        "Johns Hopkins University" = "Johns Hopkins",
+        "OHSU" = "Oregon Health & Science University",
+        "OHSU (Knight Center)" = "Oregon Health & Science University",
+        "The Ohio State University" = "Ohio State University",
+        "UCSC" = "University of California Santa Cruz",
+        "univ. ca. santa cruz" = "University of California Santa Cruz",
+        "university of California santa cruz" = "University of California Santa Cruz",
+        "UMASS Chan Medical School" = "UMass Chan Medical School",
+        "Umass Chan Medical School" = "UMass Chan Medical School",
+        "Washington University in St Louis" = "Washington University in St. Louis",
+        "yikongene" = "Yikon Genomics",
+        "v" = "Unknown"
+      )
+  )
+

Elizabeth Humphries grouped institutional affiliations into a limited set of categories: R1 University, R2 University, Community College, Medical Center or School, International Location, Research Center, NIH, Industry, Unknown, and we notated those groupings/labels within the institution_codebook.txt data file. Grouping into limited institutional affiliation categories allows us to consolidate free answers for easier data visualization and identification of trends.

+
+ +Description of variable definitions and steps + +

We use read_delim() to read in the institution_codebook file and select just the InstitutionalAffiliation and InstitutionalType columns (ignoring the column that specifies how institutions were entered by survey respondents). We then use a full_join by the InstitutionalAffiliation column so that the category labels are added as a new InstitutionalType column, joining the appropriate values based on the InstitutionalAffiliation column.

+
+
institutionCodeBook <- read_delim(here("data/institution_codebook.txt"), delim="\t", col_select = c(InstitutionalAffiliation, InstitutionalType))
+
+resultsTidy <- full_join(resultsTidy, institutionCodeBook, by = "InstitutionalAffiliation")
+

Here we further simplify Institutional Affiliations to focus on Research Intensive, Education Focused, and Industry & Other.

+

This groups R1 University, Research Center, Medical Center or School, +and NIH as “Research Intensive”; R2 University & Community College +as “Education Focused”; and Industry, International Location, or Unknown +as “Industry & Other”.

+
resultsTidy %<>% 
+  mutate(FurtherSimplifiedInstitutionalType = 
+           case_when(
+             InstitutionalType == "R1 University" ~ "Research Intensive",
+             InstitutionalType == "Research Center" ~ "Research Intensive",
+             InstitutionalType == "Medical Center or School" ~ "Research Intensive",
+             InstitutionalType == "NIH" ~ "Research Intensive",
+             InstitutionalType == "R2 University" ~ "Education Focused",
+             InstitutionalType == "Community College" ~ "Education Focused",
+             InstitutionalType == "Industry" ~ "Industry & Other",
+             InstitutionalType == "International Location" ~ "Industry & Other",
+             InstitutionalType == "Unknown" ~ "Industry & Other"
+           )
+         )
+
+
+
+

Highest degree attained

+

This question allowed more than one response; however, only one response selected two (PhD, MD), which we recoded to be MD/PhD. We simplify the possible responses to group attained or in-progress degrees.

+
+ +Question and possible answers + +
+

What is the highest degree you have attained?

+
+

Possible answers include (and multiple choices could be selected and +would be comma separated if so)

+
    +
  • High school or equivalent
  • +
  • Bachelor’s degree
  • +
  • Master’s degree in progress
  • +
  • Master’s degree
  • +
  • PhD in progress
  • +
  • PhD
  • +
  • MD in progress
  • +
  • MD
  • +
  • Other (with free text entry)
  • +
+
+
+ +Degree recoding details + +
+ +Description of variable definitions and steps + +

Because multiple responses could be selected (and would be comma separated) and because a free text response was possible if Other was selected, we need to tidy the data from this question. From visual inspection of the data, I see that the only time multiple responses were selected was for MD/PhD. No Other responses were selected. So we'll just recode "PhD, MD" to be "MD/PhD".

+

Let's also set the factor levels to follow the general progression of degrees.

+
+
resultsTidy %<>%
+  mutate(
+    Degrees =
+      factor(recode(Degrees, "PhD, MD" = "MD/PhD"), levels = c("High school or equivalent", "Bachelor's degree", "Master's degree in progress", "Master's degree", "PhD in progress", "PhD", "MD in progress", "MD", "MD/PhD")),
+    FurtherSimplifiedDegrees = recode(Degrees, 
+                                      "Master's degree in progress" = "Master's degree (or in progress)",
+                                      "Master's degree" = "Master's degree (or in progress)",
+                                      "PhD in progress" = "PhD (or in progress)",
+                                      "PhD" = "PhD (or in progress)",
+                                      "MD/PhD" = "MD (MD, MD/PhD, or in progress)", 
+                                      "MD in progress" = "MD (MD, MD/PhD, or in progress)", 
+                                      "MD" = "MD (MD, MD/PhD, or in progress)"
+                          )
+  )
+
+
+
+

Tool Knowledge and Comfort Separate from the AnVIL and on the +AnVIL

+

We want to recode these responses to set the factor level/progression +from Don’t know it, not at all comfortable, all the way to extremely +comfortable and make corresponding integer comfort scores.

+
+ +Question and possible answers + +
+

How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? How would you rate your knowledge of or comfort with these technologies (on the AnVIL)? How would you rate your knowledge of or comfort with these AnVIL data features? Shared technologies between these questions include

+
+
    +
  • Jupyter Notebooks: CurrentAnVILTechJupyterNotebooks +& AllTechJupyterNotebooks
  • +
  • Bioconductor & RStudio: CurrentAnVILTechRStudio +& AllTechRStudio + +AllTechBioconductor
  • +
  • Galaxy: CurrentAnVILTechGalaxy & +AllTechGalaxy
  • +
  • WDL Workflows / Workflows (e.g., WDL): +CurrentAnVILTechWDL & +AllTechWorkflows
  • +
  • Containers: CurrentAnVILTechContainers & +AllTechContainers
  • +
  • Unix / Command Line: CurrentAnVILTechCommandLine & +AllTechCommandLine
  • +
+

Technologies only asked separate from the AnVIL

+
    +
  • Python: AllTechPython
  • +
  • R: AllTechR
  • +
+

Technologies/data features only asked with regards to the AnVIL

+
    +
  • Accessing controlled access datasets: +CurrentAnVILTechAccessData
  • +
  • DUOS (Data Use Oversight System): +CurrentAnVILTechDUOS
  • +
  • Terra on AnVIL (Workspaces): CurrentAnVILTechTerra
  • +
  • TDR (Terra Data Repository): CurrentAnVILTechTDR
  • +
+

Possible answers for each of these questions include

+
    +
  • Don’t know it (0)
  • +
  • Not at all comfortable (1)
  • +
  • Slightly comfortable (2)
  • +
  • Somewhat comfortable (3)
  • +
  • Moderately comfortable (4)
  • +
  • Extremely comfortable (5)
  • +
+

We notated possible "comfort scores" in parentheses next to each possible answer. We'll add these as additional columns whose names now start with "Score_" but otherwise retain the original column name, in case it's helpful to still have the word-based columns (whose factor levels we'll set to reflect the progression of knowledge/comfort).

+

Responses are NA if the question wasn’t asked to the survey taker +(e.g., they were a potential user and weren’t asked about technologies +with regards to the AnVIL)

+
+
+ +Cleaning Comfort level/scores for various technologies and resources +details + +

It's likely that someone who's a program administrator will select don't know for these… should we remove them and see how average scores change?

+
+ +Description of variable definitions and steps + +

We select the relevant columns (those that start with +“CurrentAnVILTech” or “AllTech”) we want to work with. We don’t want +them to be lists. The non-tidyverse way of doing this would be +unlist(as.character(resultsTidy$PotentialRankEasyBillingSetup)). +We can use the unnest tidyverse function with a +keep_empty = TRUE argument so that it preserves the NULL +values. Notice in the non-tidyverse way, we had to use +as.character in order to preserve the null values. In the +tidyverse way, we still have to use an as.character type change before +the unnest, otherwise, we get an error that double and +character values can’t be combined.

+

After the unnest we can use the mutate function to first work with these as factors (to set the progression we want from don't know it all the way to extremely comfortable) and then to make the replacements specified above for an integer score in place of the comfort level, placing these scores in new columns with names that begin with "Score_" followed by the corresponding original column name.

+
+
resultsTidy %<>%
+  mutate(across(starts_with(c(
+    "CurrentAnVILTech", "AllTech"
+  )), as.character)) %>%
+  unnest(starts_with(c("CurrentAnVILTech", "AllTech")), keep_empty = TRUE) %>%
+  mutate(across(starts_with(c(
+    "CurrentAnVILTech", "AllTech"
+  )), ~ parse_factor(
+    .,
+    levels = c(
+      "Don't know it",
+      "Not at all comfortable",
+      "Slightly comfortable",
+      "Somewhat comfortable",
+      "Moderately comfortable",
+      "Extremely comfortable"
+    )
+  ))) %>%
+  mutate(across(
+    starts_with(c("CurrentAnVILTech", "AllTech")),
+    ~ case_when(
+      . == "Don't know it" ~ 0,
+      . == "Not at all comfortable" ~ 1,
+      . == "Slightly comfortable" ~ 2,
+      . == "Somewhat comfortable" ~ 3,
+      . == "Moderately comfortable" ~ 4,
+      . == "Extremely comfortable" ~ 5
+    )
+    ,
+    .names = "Score_{.col}"
+  ))
+
+
+
+

Feature importance: Comparisons of rank of importance of +features/resources between Current Users and Potential Users

+

We want to recode these responses to remove labels and make them +integers.

+
+ +Question and possible answers + +
+

Rank the following features or resources according to their +importance for your continued use of the AnVIL

+
+
+

Rank the following features or resources according to their +importance to you as a potential user of the AnVIL?

+
+
    +
  • Easy billing setup
  • +
  • Flat-rate billing rather than use-based
  • +
  • Free version with limited compute or storage
  • +
  • On demand support and documentation
  • +
  • Specific tools or datasets are available/supported
  • +
  • Greater adoption of the AnVIL by the scientific community
  • +
+

We’re going to look at a comparison of the assigned ranks for these +features, comparing between current users and potential users.

+
+

Recode rank values

+
+ +Description of variable definitions and steps + +

Columns of interest include

+
    +
  • PotentialRankEasyBillingSetup
  • +
  • PotentialRankFlatRateBilling
  • +
  • PotentialRankFreeVersion
  • +
  • PotentialRankSupportDocs
  • +
  • PotentialRankToolsData
  • +
  • PotentialRankCommunityAdoption
  • +
  • CurrentRankEasyBillingSetup
  • +
  • CurrentRankFlatRateBilling
  • +
  • CurrentRankFreeVersion
  • +
  • CurrentRankSupportDocs
  • +
  • CurrentRankToolsData
  • +
  • CurrentRankCommunityAdoption
  • +
+
+
+ +Cleaning the feature importance ranks details + +
+ +Description of variable definitions and steps + +

We can use starts_with to select these columns, specifically focusing on column names that start with "PotentialRank" or "CurrentRank". When we made simplified names for the columns, these are the only twelve that start like that.

+

Either the 6 CurrentRank or the 6 PotentialRank were asked to each +survey taker which means that we expect NULL values in these columns +since not every survey taker will have answered all of these +questions.

+

We want to recode the following values

+
    +
  • Replace 1 (Most important in this list) with 1
  • +
  • Replace 6 (Least important in this list) with 6
  • +
+

Before we can do that, we first need to change the type of the +columns in several ways. We don’t want them to be lists. The +non-tidyverse way of doing this would be +unlist(as.character(resultsTidy$PotentialRankEasyBillingSetup)). +We can use the unnest tidyverse function with a +keep_empty = TRUE argument so that it preserves the NULL +values. Notice in the non-tidyverse way, we had to use +as.character in order to preserve the null values. In the +tidyverse way, we still have to use an as.character type change before +the unnest, otherwise, we get an error that double and +character values can’t be combined.

+

After the unnest we can use the recode function to make the replacements specified above. And then we go ahead and change the type from character to integer so that we can compute average rank & plot them more easily. There would be a warning that NAs are introduced by coercion when we change the type to integer, so we add a replacement in the recode, changing "NULL" to NA_character_.

+
+
resultsTidy %<>%
+  mutate(across(starts_with(c(
+    "PotentialRank", "CurrentRank"
+  )), as.character)) %>%
+  unnest(starts_with(c("PotentialRank", "CurrentRank")), keep_empty = TRUE) %>%
+  mutate(across(
+    starts_with(c("PotentialRank", "CurrentRank")),
+    ~ recode(
+      .x,
+      "1 (Most important in this list)" = "1",
+      "6 (Least important in this list)" = "6",
+      "NULL" = NA_character_
+    )
+  )) %>%
+  mutate(across(starts_with(c(
+    "PotentialRank", "CurrentRank"
+  )), as.integer))
+
+
+
+
+

Training Modality Preference

+

We want to recode these responses to remove labels and make them +integers.

+
+ +Question and possible answers + +
+

Please rank how/where you would prefer to attend AnVIL training +workshops.

+
+

Possible answers include

+
    +
  • On-site at my institution: +AnVILTrainingWorkshopsOnSite
  • +
  • Virtual: AnVILTrainingWorkshopsVirtual
  • +
  • Conference (e.g., CSHL, AMIA): +AnVILTrainingWorkshopsConference
  • +
  • AnVIL-specific event: +AnVILTrainingWorkshopsSpecEvent
  • +
  • Other: AnVILTrainingWorkshopsOther
  • +
+

The responses are stored in the columns whose names start with AnVILTrainingWorkshops.

+
+
+ +Cleaning the training modality ranks details + +
+ +Description of variable definitions and steps + +

We can use starts_with to select these columns, specifically focusing on column names that start with "AnVILTrainingWorkshops". These are the only 5 that start like that when we made simplified column names.

+

We want to recode the following values

+
    +
  • Replace 1 (Most preferred in this list) with 1
  • +
  • Replace 5 (Least preferred in this list) with 5
  • +
+

Before we can do that, we first need to change the type of the +columns in several ways. We don’t want them to be lists. We can use the +unnest tidyverse function with a +keep_empty = TRUE argument so that it preserves any NULL +values, but first we have to use an as.character type +change before the unnest, otherwise, we get an error that +double and character values can’t be combined.

+

After the unnest we can use the recode function to make the replacements specified above. And then we go ahead and change the type from character to integer so that we can compute average rank & plot them more easily. There would be a warning that NAs are introduced by coercion when we change the type to integer, so we add a replacement in the recode, changing "NULL" to NA_character_.

+
+
resultsTidy %<>%
+  mutate(across(starts_with(
+    "AnVILTrainingWorkshops"), as.character)) %>%
+  unnest(starts_with("AnVILTrainingWorkshops"), keep_empty = TRUE) %>%
+  mutate(across(
+    starts_with("AnVILTrainingWorkshops"),
+    ~ recode(
+      .x,
+      "1 (Most preferred in this list)" = "1",
+      "5 (Least preferred in this list)" = "5",
+      "NULL" = NA_character_
+    )
+  )) %>%
+  mutate(across(starts_with("AnVILTrainingWorkshop"), as.integer))
+
+
+
+

Simplified experience status for various research categories +(clinical, human genomics, non-human genomics)

+

We want to add three columns that act as flags reporting whether the respondent is

+
    +
  • experienced with clinical research, specifically either moderately +or extremely experienced in working with human clinical data
  • +
  • experienced with human genomics research, specifically is moderately +or extremely experienced in working with human genomics data
  • +
  • experienced with non-human genomics research, specifically is moderately or extremely experienced in working with non-human genomics data
  • +
+

We will use this information later to subset responses when +considering popular tools or datasets.

+
+ +Question and possible answers + +
+

How much experience do you have analyzing the following data +categories?

+
+

The three research categories people are asked about include

+
    +
  • Human Genomic
  • +
  • Non-human Genomic
  • +
  • Human Clinical
  • +
+

Possible answers include

+
    +
  • Not at all experienced
  • +
  • Slightly experienced
  • +
  • Somewhat experienced
  • +
  • Moderately experienced
  • +
  • Extremely experienced.
  • +
+
+
+ +Setting research category experience flag details + +
+ +Description of variable definitions and steps + +

We use a mutate together with 3 +case_when’s.

+
    +
  • If the HumanClinicalExperience column response is +“Moderately experienced” or “Extremely experienced”, we mark that +respondent as a human clinical research expert in the +clinicalFlag column (TRUE). Otherwise, we mark +a FALSE to signify they are not a clinical research +expert.
  • +
  • If the HumanGenomicExperience column response is +“Moderately experienced” or “Extremely experienced”, we mark that +respondent as a human genomic research expert in the +humanGenomicFlag column (TRUE). Otherwise, we +again mark a FALSE to signify not an expert.
  • +
  • If the NonHumanGenomicExperience column response is +“Moderately experienced” or “Extremely experienced”, we mark that +respondent as a non-human genomic research expert in the +nonHumanGenomicFlag column (TRUE). Otherwise, +we again mark a FALSE to signify not an expert.
  • +
+
+
resultsTidy %<>%
+  mutate(
+  clinicalFlag = case_when(
+           HumanClinicalExperience == "Moderately experienced" | HumanClinicalExperience == "Extremely experienced" ~ TRUE,
+           .default = FALSE
+         ),
+  humanGenomicFlag = case_when(
+           HumanGenomicExperience == "Moderately experienced" | HumanGenomicExperience == "Extremely experienced" ~ TRUE,
+           .default = FALSE
+         ),
+  nonHumanGenomicFlag = case_when(NonHumanGenomicExperience == "Moderately experienced" | NonHumanGenomicExperience == "Extremely experienced" ~ TRUE,
+          .default = FALSE)
+  )
+
+
+
+
+

Insights

+
+

Identify type of user

+

Takeaway: Of the 50 responses, 22 were current users and 28 were potential users. The majority of current users belonged to the group who use the AnVIL for ongoing projects, while potential users were mostly evenly split between those who have never used the AnVIL (but have heard of it) and those who previously used the AnVIL but don't currently.

+

Potential Follow-ups:

+
    +
  • Look to see if those potential users who previously used to use the +AnVIL show similarity in overall trends with the rest of the potential +users
  • +
  • Directly ask why they no longer use the AnVIL (Elizabeth mentioned the possibility that the AnVIL is sometimes used in courses or workshops and students may not use it after that)
  • +
+
+

Prepare and plot the data

+
+ +Description of variable definitions and steps + +

First, we group the data by the assigned UserType labels/categories and their related more detailed descriptions. Then we use summarize to count the occurrences for each of those categories. We use a mutate statement to better fit the detailed descriptions on the plot. We then send this data to ggplot with the count on the x-axis and the usage descriptions on the y-axis (ordered by count so the highest count is on top). We fill with the user type description we've assigned. We manually scale the fill to be AnVIL colors and specify we want this to be a stacked bar chart. We then make edits for the theme and labels and finally add a geom_text label for the count next to the bars before we save the plot.

+
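The plotting code is folded in the rendered report; a condensed sketch of the steps just described could look like the following (the count column name n and the fill hex colors are illustrative assumptions, and the report additionally shortens the description text, which is omitted here):

userTypePlotData <- resultsTidy %>%
  group_by(UserType, CurrentUsageDescription) %>%
  summarize(n = n(), .groups = "drop")

ggplot(userTypePlotData,
       aes(x = n,
           y = reorder(CurrentUsageDescription, n),  # highest count on top
           fill = UserType)) +
  geom_col(position = "stack") +
  geom_text(aes(label = n), hjust = -0.3) +          # count next to each bar
  scale_fill_manual(values = c("PotentialUser" = "#035C94",   # placeholder colors
                               "CurrentUser"   = "#E57200")) +
  labs(x = "Count", y = NULL, fill = "User type") +
  theme_classic()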
+

+
+
+
+

Demographics: Institutional Affiliation

+

Takeaway:

+
+

Prepare and plot the data

+

+
+
+
+

Demographics: Highest Degree Attained

+

Takeaway:

+
+

Prepare and plot the data

+
+ +Description of variable definitions and steps + +

First we use group_by() to select Degrees and UserType in conjunction with summarize() and n() to count how many of each combination are observed in the data.

+

Then we send this data to ggplot and make a bar chart with the x-axis representing the degrees, reordered by the summed count so that higher counts come first (otherwise the 2 MDs would be located after the high school and master's-in-progress bars, which have 1 each). The y-axis represents the count, and the fill is used to specify user type (current or potential AnVIL users). We use a stacked bar chart and include labels above each bar with the total sum for that degree type.

+

Used this +stackoverflow post to label sums above the bars

+

and used this +stackoverflow post to remove NA from the legend

+

The rest of the changes are related to theme and labels and making +sure that the numerical bar labels aren’t cut off on the top.

+
+

+
+
+
+

Experience: Genomics and Clinical Research Experience

+

Takeaway:

+
+

Prepare and plot the data

+
+ +Description of variable definitions and steps for preparing the data + +

Here we select the columns containing answers for each data category: HumanGenomicExperience, HumanClinicalExperience, and NonHumanGenomicExperience. We also select UserType in case we want to split user type out at all in viewing the data. We use a pivot_longer to make a long dataframe that can be grouped and the groups counted. The category/column names go to a new column, researchType, and the values in those columns go to a new column, experienceLevel. Before we use group by and count, we set the factor level on the new experienceLevel column to match the progression from not at all experienced to extremely experienced, and we rename the research categories so that the words have spaces and we say research instead of experience. Then we use group_by and summarize to add counts for each combination of research category, experience level, and user type. These counts are in the new n column.

+
+
+ +Description of variable definitions and steps for plotting the bar graph + +

We didn’t observe big differences between current and potential +users, so we believe this grouped plot is useful for understanding the +community as a whole.

+

This bar plot has the experience level on the x-axis, the count on the y-axis, and fills the bars according to the experience level (though the fill/color legend is turned off by setting legend.position to none). We facet by the research category type and label the bars. We use a summary stat with the sum function and after_stat(y) for the label since the data has splits (like user type) that we're not visualizing here.

+

We adjust various aspects of the theme like turning off the grid and +background and rotating the x-tick labels and changing the x- and y-axis +labels. We also slightly widen the left axis so that the tick labels +aren’t cut off.

+
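A sketch of the preparation and faceted bar plot just described (not the report's folded code; the recoded category labels and theme details are illustrative choices that follow the prose):

experienceLong <- resultsTidy %>%
  select(UserType, HumanGenomicExperience,
         HumanClinicalExperience, NonHumanGenomicExperience) %>%
  pivot_longer(cols = -UserType,
               names_to = "researchType",
               values_to = "experienceLevel") %>%
  mutate(
    experienceLevel = factor(experienceLevel,
                             levels = c("Not at all experienced",
                                        "Slightly experienced",
                                        "Somewhat experienced",
                                        "Moderately experienced",
                                        "Extremely experienced")),
    researchType = recode(researchType,
                          "HumanGenomicExperience"    = "Human Genomic Research",
                          "HumanClinicalExperience"   = "Human Clinical Research",
                          "NonHumanGenomicExperience" = "Non-human Genomic Research")
  ) %>%
  group_by(researchType, experienceLevel, UserType) %>%
  summarize(n = n(), .groups = "drop")

ggplot(experienceLong, aes(x = experienceLevel, y = n, fill = experienceLevel)) +
  geom_col() +
  # sum across the user type split so each bar is labeled with its total
  stat_summary(aes(label = after_stat(y), group = experienceLevel),
               fun = sum, geom = "text", vjust = -0.5) +
  facet_wrap(~ researchType) +
  labs(x = "Experience level", y = "Count") +
  theme_classic() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))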
+

+
+
+
+

Experience: Controlled Access Datasets

+

Takeaway:

+
+ +Question and possible answers + +
+

What large, controlled access datasets do you access or would you be +interested in accessing using the AnVIL?

+
+
    +
  • All of Us*
  • +
  • Centers for Common Disease Genomics (CCDG)
  • +
  • The Centers for Mendelian Genomics (CMG)
  • +
  • Clinical Sequencing Evidence-Generating Research (CSER)
  • +
  • Electronic Medical Records and Genomics (eMERGE)
  • +
  • Gabriella Miller Kids First (GMKF)
  • +
  • Genomics Research to Elucidate the Genetics of Rare Diseases +(GREGoR)
  • +
  • The Genotype-Tissue Expression Project (GTEx)
  • +
  • The Human Pangenome Reference Consortium (HPRC)
  • +
  • Population Architecture Using Genomics and Epidemiology (PAGE)
  • +
  • Undiagnosed Disease Network (UDN)
  • +
  • UK Biobank*
  • +
  • None
  • +
  • Other (Free Text Response)
  • +
+

Since this is a select-all-that-apply question, we expect that there will be multiple responses that are comma separated. The free text responses will likely need to be recoded as well. The responses are in the AccessWhichControlledData column.

+
+
+

Prepare and plot the data

+
+ +Description of variable definitions and steps for preparing the data + +

We make a function prep_df_whichData() since we'll be using this workflow a few times for different subsets of the data, because we want to be able to differentially display the data based on the experience status (experienced with clinical research, human genomics research, etc.) of the person saying they'd like access to the data.

+

We want to color the bars based on whether or not the controlled access dataset is currently available on the AnVIL. We create a dataframe onAnVILDF to report this, using the AnVIL dataset catalog/browser to find out this information. However, HPRC and GREGoR don't show up in that resource, but both are available per these sources: Announcement for HPRC, Access for HPRC, Access for GREGoR. Both GMKF and TCGA are data hosted on other NCPI platforms that are accessible via the AnVIL because of interoperability. (See: https://www.ncpi-acc.org/ and https://ncpi-data.org/platforms). We list these as non-AnVIL hosted since, while accessible, they are not hosted on the AnVIL and would be inaccessible without NCPI. Finally, UDN is described as non-AnVIL hosted as it is in the data submission pipeline and not yet available.

+

We’ll join this anvil-hosted or not data with the actual data at the +end.

+

Given the input subset_df, we expect several answers to be comma separated. Since there are 12 set possible responses (not including "None") and one possible free response answer, we separate the AccessWhichControlledData column into 13 columns ("WhichA" onward), splitting on ", " (a comma followed by a space; otherwise there were duplicates where the only difference was a leading space). Alternative approaches could consider using str_trim. We set fill to "right", but this shouldn't really matter; it just suppresses the unnecessary warning about adding NAs when there aren't 13 responses. If there's only one response, it'll put that response in WhichA and fill the rest with NA. If there are two responses, it'll put those two responses in WhichA and WhichB and fill the rest with NA, etc.

+

We then use pivot_longer to grab these columns we just made, putting the column names in a new column WhichChoice and the values from each column in a new column whichControlledAccess. We drop all the NAs in this new whichControlledAccess column (and there are a lot of them).

+

Then we group by the new whichControlledAccess column +and summarize a count for how many there are for each response.

+

Then we pass this to a mutate and recode function to simplify the +fixed responses to be just their acronyms, to remove asterisks (that let +the survey respondent know that that dataset wasn’t available because of +policy restrictions), and to recode the free text responses (details +below in “Notes on free text response recoding”).

+

We use a left_join() to join the cleaned data with a +dataframe that specifies whether that dataset is currently available on +the AnVIL or not. It’s a left join rather than a full join so it’s only +adding the annotation for datasets that are available in the +results.

+

Finally, we return this subset and cleaned dataframe so that it can +be plotted.

+
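A loose sketch of prep_df_whichData() following these steps (the report's code is folded, so the onAnVILDF availability values below are illustrative placeholders and the recode list is abbreviated):

# Illustrative availability annotation; the real onAnVILDF covers every dataset
onAnVILDF <- tibble(
  whichControlledAccess = c("CCDG", "CMG", "GTEx", "GMKF", "UDN", "Other"),
  onAnVIL = c(TRUE, TRUE, TRUE, FALSE, FALSE, NA)
)

prep_df_whichData <- function(subset_df) {
  subset_df %>%
    separate(AccessWhichControlledData,
             into = paste0("Which", LETTERS[1:13]),  # 13 columns, as described
             sep = ", ", fill = "right") %>%
    pivot_longer(starts_with("Which"),
                 names_to = "WhichChoice",
                 values_to = "whichControlledAccess") %>%
    drop_na(whichControlledAccess) %>%
    group_by(whichControlledAccess) %>%
    summarize(n = n(), .groups = "drop") %>%
    mutate(whichControlledAccess = recode(
      whichControlledAccess,
      "Centers for Common Disease Genomics (CCDG)"    = "CCDG",
      "The Centers for Mendelian Genomics (CMG)"      = "CMG",
      "The Genotype-Tissue Expression Project (GTEx)" = "GTEx",
      "GnomAD and ClinVar"                            = "None",
      "Cancer omics datasets"                         = "Other"
      # ...remaining acronym, asterisk-stripping, and free-text recodings omitted
    )) %>%
    left_join(onAnVILDF, by = "whichControlledAccess")
}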
+
+ +Additional notes on free text response recoding + +

There were 4 "Other" free text responses

+
    +
  • “Being able to pull other dbGap data as needed.” –> We recoded +this to be an “Other”
  • +
  • “GnomAD and ClinVar” –> GnomAD and ClinVar are not controlled +access datasets so we recoded that response to be “None”
  • +
  • “Cancer omics datasets” –> We recoded this to be an “Other”
  • +
  • “TCGA” –> This response was left as is since there is a +controlled access tier.
  • +
+
+
+ +Description of variable definitions and steps for preparing the data +continued + +

Here we set up 4 data frames for plotting

+
    +
  • The first uses all of the responses and sends them through the +prep_df_whichData() function to clean the data for plotting +to see which controlled access datasets are the most popular.
  • +
  • The second filters to grab just the responses from those experienced +in clinical research using the clinicalFlag column +(described earlier in the Clean Data -> Simplified experience status +for various research categories (clinical, human genomics, non-human +genomics) subsection)
  • +
  • The third filters to grab just the responses from those experienced +in human genomic research using the humanGenomicFlag column +(described earlier in the Clean Data -> Simplified experience status +for various research categories (clinical, human genomics, non-human +genomics) subsection)
  • +
  • The fourth filters to grab just the responses from those experienced +in non-human genomic research using the nonHumanGenomicFlag +column (described earlier in the Clean Data -> Simplified experience +status for various research categories (clinical, human genomics, +non-human genomics) subsection)
  • +
+
+
+ +Description of variable definitions and steps for plotting the bar +graphs + +

We also use a function here because the plotting steps are the same for each; only the subtitle and the input dataframe change.

+

This takes the input dataframe and plots a bar plot with the x-axis +having the controlled access datasets listed (reordering the listing +based off of the count so most popular is on the left), the count +number/popularity of requested is on the y-axis, and the fill is based +on whether the dataset is available on AnVIL or not.

+

We change the theme elements like removing panel borders, panel +background, and panel grid, and rotate the x-axis tick labels. We add an +x- and y- axis label and add a title (and subtitle if specified - which +it will be when we’re looking at just a subset like those who are +experienced with clinical data)

+

We also add text labels above the bars to say how many times each dataset was marked/requested. Note that we have to use the after_stat, summary, and sum approach again: because responses were recoded after we used group_by and summarize to count, the labels need to capture every row that was recoded to the same category in order to be accurate. The function uses coord_cartesian(clip = "off") so these bar text labels aren't cut off, and finally returns the plot.

+
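A sketch of what such a plotting helper could look like (argument names, titles, and theme details below are illustrative; the report applies it to the four dataframes listed next):

plot_whichData <- function(plot_df, plotsubtitle = NULL) {
  ggplot(plot_df,
         aes(x = reorder(whichControlledAccess, -n), y = n, fill = onAnVIL)) +
    geom_col() +
    # after_stat/summary/sum so labels stay accurate when recoding collapsed
    # several responses into the same category after counting
    stat_summary(aes(label = after_stat(y), group = whichControlledAccess),
                 fun = sum, geom = "text", vjust = -0.5) +
    coord_cartesian(clip = "off") +
    labs(x = "Controlled access dataset", y = "Count",
         title = "Controlled access datasets of interest",
         subtitle = plotsubtitle, fill = "Available on AnVIL") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

# e.g., all responses; the report repeats this for the three experience subsets below
plot_whichData(prep_df_whichData(resultsTidy))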

We call this function 4 times

+
    +
  • once for all the data (and don’t use a subtitle)
  • +
  • next for just those experienced with clinical data (using a subtitle +to specify this)
  • +
  • next for just those experienced with human genomic data (using a +subtitle to specify this)
  • +
  • and finally for just those experienced with non-human genomic data +(using a subtitle to specify this)
  • +
+
+
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
+## 3.5.0.
+## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
+## This warning is displayed once every 8 hours.
+## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
+## generated.
+

+

+

+

+
+
+
+

Experience: Tool & Resource Knowledge/Comfort level

+

Takeaway:

+
+

Prepare and plot the data

+
+ +Description of variable definitions and steps for preparing the data + +
+
+ +Description of variable definitions and steps for plotting the dumbbell +like plot + +

Used this +stackoverflow response to get the values for the +scale_shape_manual()

+
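The preparation and plotting code is folded here, so the following is only a loose guess at the comparison (mean comfort score per technology, on the AnVIL versus in general), with shapes distinguishing the two contexts as hinted by the scale_shape_manual() note above; every name and value choice below is an assumption:

comfortSummary <- resultsTidy %>%
  summarize(across(starts_with("Score_"), ~ mean(.x, na.rm = TRUE))) %>%
  pivot_longer(everything(),
               names_to = "Technology", values_to = "meanScore") %>%
  mutate(Context = if_else(grepl("CurrentAnVILTech", Technology),
                           "On the AnVIL", "Separate from the AnVIL"),
         Technology = sub("Score_(CurrentAnVILTech|AllTech)", "", Technology))

ggplot(comfortSummary,
       aes(x = meanScore, y = reorder(Technology, meanScore),
           shape = Context, color = Context)) +
  geom_point(size = 3) +
  scale_shape_manual(values = c(16, 17)) +  # filled circle / filled triangle
  labs(x = "Mean comfort score (0-5)", y = NULL) +
  theme_classic()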
+

+
+
+
+

Awareness: AnVIL Demos

+

Takeaway:

+
+
+

Awareness: AnVIL Support Forum

+

Takeaway:

+
+
+

Preferences: Feature importance for current vs potential users

+

Takeaway:

+
+

Prepare and plot the data

+

Average rank is the total rank (sum of given ranks) divided by the number of votes (number of given ranks).

+
+ +Description of variable definitions and steps for preparing the data + +

We make two different dataframes that find the total ranks (column name: totalRank) and average ranks (column name: avgRank) for each feature and then row bind (bind_rows) these two dataframes together to make totalRanksdf. The reason we make two separately is that one is for potential users (starts_with("PotentialRank")) and one is for current users (starts_with("CurrentRank")). They have a different number of votes (nranks), so it made more sense to work with them separately, following the same steps, and then row bind them together.

+

The individual steps for each of these dataframes are to

+
    +
  • select the relevant columns from +resultsTidy
  • +
  • perform sums with colSums, adding together the ranks in +those columns (each column corresponds to a queried feature); We set +na.rm = TRUE to ignore the NAs (since not every survey +respondent was asked each question; e.g., if they were a current user +they weren’t asked as a potential user)
  • +
  • send those sums to a data frame such that the selected column names +from the first step are now the row names and the total summed rank is +the only column with values in each row corresponding to each queried +feature
  • +
  • Use a mutate to +
      +
    • add a new column nranks that finds the number of +responses in the survey are from potential users (e.g., the number that +would have assigned ranks to the PotentialRank questions) or the number +of responses in the survey that are from current/returning users (e.g., +the number that would have assigned ranks to the CurrentRank +questions).
    • +
    • add a new column avgRank that divides the +totalRank by the nranks
    • +
  • +
+

After these two dataframes are bound together (bind_rows), the rest of the steps are for aesthetics in plotting and for making sure ggplot knows the user type and the feature of interest, etc. (see the sketch after the list below).

+
    +
  • We move the rownames to their own column +UsertypeFeature (with the +mutate(UsertypeFeature = rownames(.))).
  • +
  • We separate the values in that column on the word "Rank", removing the UsertypeFeature column we just made and making two new columns (Usertype and Feature), where Usertype is either "Current" or "Potential" and Feature holds the abbreviated feature names listed in the code below.
  • +
  • We then use a case_when within a mutate() +to fill out those features so they’re more informative and show the +choices survey respondents were given.
  • +
+
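A condensed sketch of this preparation (the report builds the two dataframes separately; the sum_ranks() helper here is an illustrative stand-in that follows the same steps):

# Sum the rank columns matching a prefix and compute the average rank;
# n_responses is the number of respondents of that user type.
sum_ranks <- function(df, prefix, n_responses) {
  df %>%
    select(starts_with(prefix)) %>%
    colSums(na.rm = TRUE) %>%
    as.data.frame() %>%
    setNames("totalRank") %>%
    mutate(nranks = n_responses,
           avgRank = totalRank / n_responses,
           UsertypeFeature = rownames(.))
}

totalRanksdf <-
  bind_rows(
    sum_ranks(resultsTidy, "PotentialRank", sum(resultsTidy$UserType == "PotentialUser")),
    sum_ranks(resultsTidy, "CurrentRank",   sum(resultsTidy$UserType == "CurrentUser"))
  ) %>%
  separate(UsertypeFeature, into = c("Usertype", "Feature"), sep = "Rank") %>%
  mutate(Feature = case_when(
    Feature == "EasyBillingSetup"  ~ "Easy billing setup",
    Feature == "FlatRateBilling"   ~ "Flat-rate billing rather than use-based",
    Feature == "FreeVersion"       ~ "Free version with limited compute or storage",
    Feature == "SupportDocs"       ~ "On demand support and documentation",
    Feature == "ToolsData"         ~ "Specific tools or datasets are available/supported",
    Feature == "CommunityAdoption" ~ "Greater adoption of the AnVIL by the scientific community"
  ))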
+
+ +Description of variable definitions and steps for plotting the dumbbell +plot + +

We use the totalRanksdf we just made. The x-axis is the +avgRank values, and the y-axis displays the informative +Feature values, however, we reorder the y-axis +so that more important (lower number) avgRank features are displayed +higher in the plot.

+

geom_point and geom_line are used in conjunction to produce the dumbbell look of the plot, and we set the color of the points to correspond to the Usertype.

+

Some theme things are changed, labels and titles added, setting the +color to match AnVIL colors, and then we display and save that plot.

+

The first version of the plot has trimmed limits, so the second +version sets limits on the x-axis of 1 to 6 since those were the options +survey respondents were given for ranking. It also adds annotations +(using Grobs, explained +in this Stackoverflow post answer) to specify which rank was “Most +important” and which was “Least important”.

+

Then we’ve also adjusted the left margin so that the annotation isn’t +cut off.

+

We then display and save that version as well.

+

Finally, we’ll reverse the x-axis so that most important is on the +right and least important is on the left. We use +scale_x_reverse() for that. We have to change our group +annotations so that they are now on the negative number version of +xmin and xmax that we were using previously. +We then display and save that version as well.

+
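A sketch of the dumbbell-style plot described above (the hex colors are placeholders, the fixed 1 to 6 limits and the grob annotations for "Most"/"Least important" are omitted for brevity):

ggplot(totalRanksdf,
       aes(x = avgRank, y = reorder(Feature, -avgRank))) +   # most important features on top
  geom_line(aes(group = Feature), color = "grey70", linewidth = 1.5) +
  geom_point(aes(color = Usertype), size = 3) +
  scale_x_reverse() +                                        # most important (rank 1) on the right
  scale_color_manual(values = c("Current" = "#E57200",       # placeholder colors
                                "Potential" = "#035C94")) +
  labs(x = "Average rank (1 = most important)", y = NULL, color = "User type") +
  theme_classic()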
+

+
+
+
+

Preferences: Training Workshop Modality

+

Takeaway:

+
+

Prepare and plot the data

+
+ +Description of variable definitions and steps for preparing the data + +
+
+ +Description of variable definitions and steps for plotting the dumbbell +plot + +
+

+
+
+
+

Returning User Specific: Likely to recommend?

+

Takeaway:

+
+
+

Returning User Specific: Number of years of use

+

Takeaway:

+
+
+

Returning User Specific: Foreseeable Computational Needs

+

Takeaway:

+
+
+

Session Info and other analysis notes

+
+ +Session Info + +
sessionInfo()
+
## R version 4.4.0 (2024-04-24)
+## Platform: aarch64-apple-darwin20
+## Running under: macOS Sonoma 14.4.1
+## 
+## Matrix products: default
+## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
+## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
+## 
+## locale:
+## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+## 
+## time zone: America/New_York
+## tzcode source: internal
+## 
+## attached base packages:
+## [1] grid      stats     graphics  grDevices utils     datasets  methods  
+## [8] base     
+## 
+## other attached packages:
+##  [1] here_1.0.1          magrittr_2.0.3      lubridate_1.9.3    
+##  [4] forcats_1.0.0       stringr_1.5.1       dplyr_1.1.4        
+##  [7] purrr_1.0.2         readr_2.1.5         tidyr_1.3.1        
+## [10] tibble_3.2.1        ggplot2_3.5.1       tidyverse_2.0.0    
+## [13] googlesheets4_1.1.1
+## 
+## loaded via a namespace (and not attached):
+##  [1] gtable_0.3.5      xfun_0.43         bslib_0.7.0       gargle_1.5.2     
+##  [5] tzdb_0.4.0        vctrs_0.6.5       tools_4.4.0       generics_0.1.3   
+##  [9] curl_5.2.1        parallel_4.4.0    fansi_1.0.6       highr_0.10       
+## [13] pkgconfig_2.0.3   lifecycle_1.0.4   compiler_4.4.0    farver_2.1.1     
+## [17] textshaping_0.3.7 munsell_0.5.1     htmltools_0.5.8.1 sass_0.4.9       
+## [21] yaml_2.3.8        pillar_1.9.0      crayon_1.5.2      jquerylib_0.1.4  
+## [25] openssl_2.1.2     cachem_1.0.8      tidyselect_1.2.1  digest_0.6.35    
+## [29] stringi_1.8.3     labeling_0.4.3    rprojroot_2.0.4   fastmap_1.1.1    
+## [33] colorspace_2.1-0  cli_3.6.2         utf8_1.2.4        withr_3.0.0      
+## [37] scales_1.3.0      rappdirs_0.3.3    bit64_4.0.5       googledrive_2.1.1
+## [41] timechange_0.3.0  rmarkdown_2.26    httr_1.4.7        bit_4.0.5        
+## [45] cellranger_1.1.0  askpass_1.2.0     ragg_1.3.0        hms_1.1.3        
+## [49] evaluate_0.23     knitr_1.46        rlang_1.1.3       glue_1.7.0       
+## [53] rstudioapi_0.16.0 vroom_1.6.5       jsonlite_1.8.8    R6_2.5.1         
+## [57] systemfonts_1.0.6 fs_1.6.4
+
+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/config_automation.yml b/config_automation.yml index 12d9dd0..fbfac6a 100644 --- a/config_automation.yml +++ b/config_automation.yml @@ -21,4 +21,4 @@ render-coursera: no # What docker image should be used for rendering? # The default is jhudsl/base_ottr:main -rendering-docker-image: 'jhudsl/base_ottr:main' +rendering-docker-image: 'jhudsl/anvil-poll-2024:main' diff --git a/contact.Rmd b/contact.Rmd deleted file mode 100644 index fc4eedc..0000000 --- a/contact.Rmd +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: "**Contact Us**" -output: - html_document ---- - - - -If you have questions please contact: - -* Carrie Wright (cwrigh60@jhu.edu) diff --git a/data/codebook.txt b/data/codebook.txt new file mode 100644 index 0000000..6aa2093 --- /dev/null +++ b/data/codebook.txt @@ -0,0 +1,62 @@ +SurveyColNames SimplifiedColNames VariableFormat Description +Timestamp Timestamp double Date and time of survey response submission +How would you describe your current usage of the AnVIL platform? CurrentUsageDescription character "Possible answers include For completed/long-term projects (e.g., occasional updates/maintenance as needed), For ongoing projects (e.g., consistent project development and/or work), For short-term projects (e.g., short, intense bursts separated by a few months). These responses are considered current or returning AnVIL users. Other possible responses represent potential AnVIL users. These include I do no currently use the AnVIL, but have in the past, I have never heard of the AnVIL, I have never used the AnVIL, but have heard of it." +Rank the following features or resources according to their importance to you as a potential user of the AnVIL? [Easy billing setup] PotentialRankEasyBillingSetup list "This question was only given to potential AnVIL users. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked" +Rank the following features or resources according to their importance to you as a potential user of the AnVIL? [Flat-rate billing rather than use-based] PotentialRankFlatRateBilling list "This question was only given to potential AnVIL users. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked" +Rank the following features or resources according to their importance to you as a potential user of the AnVIL? [Free version with limited compute or storage] PotentialRankFreeVersion list "This question was only given to potential AnVIL users. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked" +Rank the following features or resources according to their importance to you as a potential user of the AnVIL? [On demand support and documentation] PotentialRankSupportDocs list "This question was only given to potential AnVIL users. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked" +Rank the following features or resources according to their importance to you as a potential user of the AnVIL? [Specific tools or datasets are available/supported] PotentialRankToolsData list "This question was only given to potential AnVIL users. 
Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked" +Rank the following features or resources according to their importance to you as a potential user of the AnVIL? [Greater adoption of the AnVIL by the scientific community] PotentialRankCommunityAdoption list "This question was only given to potential AnVIL users. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked" +How long have you been using the AnVIL? LengthOfUse character "This question was only given to current or returning AnVIL users. Possible responses include < 1 yr, 1-2 yrs, 2-3 yrs, 4+ yrs, or NA. NA applies to survey responses where this questions wasn't asked" +Rank the following features or resources according to their importance for your continued use of the AnVIL [Easy billing and workgroup management] CurrentRankEasyBillingSetup list "This question was only given to current or returning AnVIL users. It relates to the potential ranking one from PotentialRankEasyBillingSetup. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked" +Rank the following features or resources according to their importance for your continued use of the AnVIL [Flat-rate billing rather than use-based] CurrentRankFlatRateBilling list "This question was only given to current or returning AnVIL users. It relates to the potential ranking one from PotentialRankFlatRateBilling. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked" +Rank the following features or resources according to their importance for your continued use of the AnVIL [Free version with limited compute or storage] CurrentRankFreeVersion list "This question was only given to current or returning AnVIL users. It relates to the potential ranking one from PotentialRankFreeVersion. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked" +Rank the following features or resources according to their importance for your continued use of the AnVIL [On demand support and documentation] CurrentRankSupportDocs list "This question was only given to current or returning AnVIL users. It relates to the potential ranking one from PotentialRankSupportDocs. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked" +Rank the following features or resources according to their importance for your continued use of the AnVIL [Specific tools or datasets are available/supported] CurrentRankToolsData list "This question was only given to current or returning AnVIL users. It relates to the potential ranking one from PotentialRankToolsData. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. 
NULL applies to survey responses where this question wasn't asked" +Rank the following features or resources according to their importance for your continued use of the AnVIL [Greater adoption of the AnVIL by the scientific community] CurrentRankCommunityAdoption list "This question was only given to current or returning AnVIL users. It relates to the potential ranking one from PotentialRankCommunityAdoption. Possible values are 1 (Most important in this list), 2, 3, 4, 5, 6 (Least important in this list), or NULL. NULL applies to survey responses where this question wasn't asked" +How would you rate your knowledge of or comfort with these technologies (on the AnVIL)? [Jupyter Notebooks] CurrentAnVILTechJupyterNotebooks character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked" +How would you rate your knowledge of or comfort with these technologies (on the AnVIL)? [Bioconductor & RStudio] CurrentAnVILTechBioconductorRStudio character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked" +How would you rate your knowledge of or comfort with these technologies (on the AnVIL)? [Galaxy] CurrentAnVILTechGalaxy character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked" +How would you rate your knowledge of or comfort with these technologies (on the AnVIL)? [WDL Workflows] CurrentAnVILTechWDL character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked" +How would you rate your knowledge of or comfort with these technologies (on the AnVIL)? [Containers] CurrentAnVILTechContainers character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked" +How would you rate your knowledge of or comfort with these technologies (on the AnVIL)? [Unix / Command Line] CurrentAnVILTechCommandLine character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. 
NA applies to survey responses where this question wasn't asked" +"How would you rate your knowledge of or comfort with these AnVIL data features? [Accessing controlled access datasets (e.g., GTEx, CCDG)]" CurrentAnVILTechAccessData character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked" +How would you rate your knowledge of or comfort with these AnVIL data features? [DUOS (Data Use Oversight System)] CurrentAnVILTechDUOS character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked" +How would you rate your knowledge of or comfort with these AnVIL data features? [Terra on AnVIL (Workspaces)] CurrentAnVILTechTerra character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked" +How would you rate your knowledge of or comfort with these AnVIL data features? [TDR (Terra Data Repository)] CurrentAnVILTechTDR character "This question was only given to current or returning AnVIL users asking about their comfort with certain AnVIL tech. Possible answers were Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable, or NA. NA applies to survey responses where this question wasn't asked" +What one feature in the AnVIL do you appreciate the most? FeatureMostAppreciated character "This question was only asked to current or returning AnVIL users. Possible answers include Access to controlled access datasets, WDL Workflows, Bioconductor & RStudio on AnVIL, Jupyter Notebooks on AnVIL, Galaxy on AnVIL, Terra on AnVIL, Dockstore for workflows/containers, Available Support, Other (with free text entry if Other is selected), or NA. NA applies to survey responses where this question wasn't asked." +What computational and storage resources do you foresee needing in the next 12 months? (Select all that apply) NeededResources character "This select all question was only asked to current or returning AnVIL users. If multiple answers were selected, the answers will be comma separated. Possible answers include Many nodes, GPUs, Lots of storage (Terabytes), Large memory (>192 GB RAM), I don't know, Other (with free text entry if Other is selected), or NA. NA applies to survey responses where this question wasn't asked." +How likely are you to recommend the AnVIL to a colleague? RecommendationLikelihood double "This question was only asked to current or returning AnVIL users. Possible answers include 1, 2, 3, 4, 5, or NA. 1 corresponds to Not at all likely; and 5 corresponds to Extremely likely. NA applies to survey responses where this question wasn't asked." +What is the highest degree you have attained? 
+What is the highest degree you have attained? Degrees character "This question was given to every survey taker, and more than one answer could be selected; if so, the answers are separated by commas. Possible answers include High school or equivalent, Bachelor's degree, Master's degree in progress, Master's degree, PhD in progress, PhD, MD in progress, MD, Other (with free text entry if Other is selected)."
+Which industry do you work in? (Select all that apply) Industry character "This question was given to every survey taker, and more than one answer could be selected; if so, the answers are separated by commas. Possible answers include Academia/University, Research Institute, Clinical/Hospital, Biotech (includes start up), Pharmaceutical, Government, Other (with free text entry if Other is selected)."
+What kind of work do you do? (Check up to 2 that you do most often) KindOfWork character "This question was given to every survey taker, and more than one answer could be selected; if so, the answers are separated by commas. Possible answers include Computational work, Engineering work, Wet lab work, Clinical work, Computational education, Wet lab education, Project leadership, Project management, Program administration, Other (with free text entry if Other is selected). We asked people to select up to 2, but several selected more than 2."
+What institution are you affiliated with? InstitutionalAffiliation character "This question was given to every survey taker, and was free response."
+"Please list any relevant consortia affiliations (e.g., your consortium performs work or stores data on the AnVIL). Please enter NA if not relevant." ConsortiaAffiliations character "This question was given to every survey taker, and was free response. Users were asked to reply with NA if not relevant. There are na and NA responses. If more than one consortium is listed, responders separated responses by commas, semicolons, ampersands, or the word and."
+How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [Python] AllTechPython character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. There is no directly corresponding CurrentAnVILTech question. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [Jupyter Notebooks] AllTechJupyterNotebooks character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. This is related to the CurrentAnVILTechJupyterNotebooks question. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [R] AllTechR character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. There is no directly corresponding CurrentAnVILTech question. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [RStudio] AllTechRStudio character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. This is related to the CurrentAnVILTechBioconductorRStudio question. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [Bioconductor] AllTechBioconductor character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. This is related to the CurrentAnVILTechBioconductorRStudio question. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [Galaxy] AllTechGalaxy character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. This is related to the CurrentAnVILTechGalaxy question. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+"How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [Workflows (e.g., WDL)]" AllTechWorkflows character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. This is related to the CurrentAnVILTechWDL question. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [Containers] AllTechContainers character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. This is related to the CurrentAnVILTechContainers question. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+How would you rate your knowledge of or comfort with these technologies (separate from the AnVIL)? [Unix / Command Line] AllTechCommandLine character "This question was given to every survey taker about the tech in general, separate from its incorporation into AnVIL. This is related to the CurrentAnVILTechCommandLine question. Possible responses included Don't know it, Not at all comfortable, Slightly comfortable, Somewhat comfortable, Moderately comfortable, Extremely comfortable"
+What types of data do you or would you analyze using the AnVIL? (Select all that apply) TypesOfData character "This question was given to every survey taker, and more than one answer could be selected; if so, the answers are separated by commas. Possible answers include Genomes/exomes, Transcriptomes, Metagenomes, Proteomes, Metabolomes, Epigenomes, Structural, Single Cell, Imaging, Phenotypic, Electronic Health Record, Metadata, Survey, and Other (with free text entry if Other is selected). Other responses seem to be either very specific data types or notes that the responder does not analyze data on the AnVIL."
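TypesOfData and the other select-all columns above (Degrees, Industry, KindOfWork) store multiple selections in a single comma-separated cell, so tallying answers requires splitting them into one row per selection first. A sketch of that splitting step in R, under the same illustrative anvil_poll assumption as above:

    library(dplyr)
    library(tidyr)

    # One row per selected data type, then tally how often each type was chosen
    type_counts <- anvil_poll %>%
      separate_rows(TypesOfData, sep = ",\\s*") %>%
      filter(!is.na(TypesOfData), TypesOfData != "") %>%
      count(TypesOfData, sort = TRUE)

    # Caveat: free-text "Other" entries can contain commas of their own,
    # so they may need review or recoding before this split.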
+How much experience do you have analyzing the following data categories? [Human genomic] HumanGenomicExperience character "This question was given to every survey taker. Possible answers include Not at all experienced, Slightly experienced, Somewhat experienced, Moderately experienced, or Extremely experienced."
+How much experience do you have analyzing the following data categories? [Non-human genomic] NonHumanGenomicExperience character "This question was given to every survey taker. Possible answers include Not at all experienced, Slightly experienced, Somewhat experienced, Moderately experienced, or Extremely experienced."
+How much experience do you have analyzing the following data categories? [Human clinical] HumanClinicalExperience character "This question was given to every survey taker. Possible answers include Not at all experienced, Slightly experienced, Somewhat experienced, Moderately experienced, or Extremely experienced."
+"How interested are you in working with controlled access datasets? (e.g., GTEx, CCDG)" InterestControlledData double "This question was given to every survey taker. Possible answers include 1, 2, 3, 4, 5. 1 corresponds to Not at all interested. 5 corresponds to Extremely interested."
+"What large, controlled access datasets do you access or would you be interested in accessing using the AnVIL? Those marked with an asterisk (*) are not currently available on the AnVIL due to policy restrictions. (Select all that apply)" AccessWhichControlledData character "This question was asked to all survey takers and more than one answer could be selected. If multiple answers are selected, the answers are comma separated. Possible answers include All of Us*, Centers for Common Disease Genomics (CCDG), The Centers for Mendelian Genomics (CMG), Clinical Sequencing Evidence-Generating Research (CSER), Electronic Medical Records and Genomics (eMERGE), Gabriella Miller Kids First (GMKF), Genomics Research to Elucidate the Genetics of Rare Diseases (GREGoR), The Genotype-Tissue Expression Project (GTEx), The Human Pangenome Reference Consortium (HPRC), Population Architecture Using Genomics and Epidemiology (PAGE), Undiagnosed Disease Network (UDN), UK Biobank*, None, or Other (with free text entry if Other is selected)."
+Have you attended a monthly AnVIL Demo? (Example) AnVILDemo character "This question was given to every survey taker. Possible answers include Yes, multiple; Yes, one; No, but aware of; No, didn't know of; Not yet, but am registered to."
+Have you ever read or posted in our AnVIL Support Forum? (Select all that apply) AnVILSupportForum character "This question was given to every survey taker and more than one answer could be selected. If so, the answers would be separated by commas. Possible answers include Read through others' posts, Posted in, Answered someone's post, No, but aware of, No, didn't know of. Note that No, but aware of and No, didn't know of each contain a comma, so those embedded commas need to be handled before splitting responses on commas during the rest of the data cleaning."
+Please rank how/where you would prefer to attend AnVIL training workshops. [On-site at my institution] AnVILTrainingWorkshopsOnSite list "This question was given to every survey taker and possible answers include 1 (Most preferred in this list), 2, 3, 4, 5 (Least preferred in this list)."
+Please rank how/where you would prefer to attend AnVIL training workshops. [Virtual] AnVILTrainingWorkshopsVirtual list "This question was given to every survey taker and possible answers include 1 (Most preferred in this list), 2, 3, 4, 5 (Least preferred in this list)."
+"Please rank how/where you would prefer to attend AnVIL training workshops. [Conference (e.g., CSHL, AMIA)]" AnVILTrainingWorkshopsConference list "This question was given to every survey taker and possible answers include 1 (Most preferred in this list), 2, 3, 4, 5 (Least preferred in this list)."
+Please rank how/where you would prefer to attend AnVIL training workshops. [AnVIL-specific event] AnVILTrainingWorkshopsSpecEvent list "This question was given to every survey taker and possible answers include 1 (Most preferred in this list), 2, 3, 4, 5 (Least preferred in this list)."
+Please rank how/where you would prefer to attend AnVIL training workshops. [Other] AnVILTrainingWorkshopsOther list "This question was given to every survey taker and possible answers include 1 (Most preferred in this list), 2, 3, 4, 5 (Least preferred in this list)."
+"If you ranked \""Other\"" highly for venues you would prefer to attend for training workshops, please specify here." OtherExamplesAnVILTrainingWorkshops character "This question was given to every survey taker and was free response. Some responses are NA or na, either entered by the responder or due to the responder not providing an answer."
+Where do you currently run analyses? (Select all that apply) WhereAnalysesRun character "This question was given to every survey taker and multiple answer choices could be selected. If multiple answers were selected, the answers will be comma separated. Possible answers include Personal Computer (locally), Institutional High Performance Computing cluster (HPC), Amazon Web Services (AWS), Google Cloud Platform (GCP), Microsoft Azure, Galaxy (usegalaxy.org), or Other (with free text entry if Other is selected)"
+"What repositories are you considering to share data (for example, to comply with the NIH DMS Policy)? (Select all that apply)" RepositoriesDMS character "This question was given to every survey taker and multiple answers could be selected. If multiple answers were selected, the answers will be comma separated. Possible answers include Your institutional repository, Zenodo, AnVIL, I'm not sure what DMS policy is, or Other (with free text entry if Other is selected)."
+What source(s) of funds do you use to pay for cloud computing? (Select all that apply) FundingSources character "This question was given to all survey takers and multiple answers could be selected. If multiple answers were selected, the answers will be comma separated. Possible answers include NHGRI, Other NIH, Foundation Grant, Institutional funds, Don't know, Only use free options, Other (with free text entry if Other is selected)"
+Are you willing to be contacted again to give input in the future? ContactWillingness character This question was given to every survey taker. Possible answers include Yes or No.
+What email address may we use to contact you in the future? Email character This question was given only to survey takers who responded yes when asked if they were willing to be contacted in the future. It was a free text response in which the responder provided their email address. NA corresponds to survey responses where the question wasn't asked.
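As noted for AnVILSupportForum, two of its answer choices ("No, but aware of" and "No, didn't know of") contain commas themselves, so a naive comma split would cut them in half. One way to protect those answers before splitting, sketched in R with the same illustrative anvil_poll object (the helper names here are assumptions, not part of the repository's cleaning code):

    library(dplyr)
    library(tidyr)
    library(stringr)

    # Temporarily drop the commas inside the two known multi-word answers so that
    # splitting on "," only separates distinct selections.
    protected <- c("No, but aware of"   = "No but aware of",
                   "No, didn't know of" = "No didn't know of")

    forum_counts <- anvil_poll %>%
      mutate(AnVILSupportForum = str_replace_all(AnVILSupportForum, protected)) %>%
      separate_rows(AnVILSupportForum, sep = ",\\s*") %>%
      count(AnVILSupportForum, sort = TRUE)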
diff --git a/data/controlledAccessData_codebook.txt b/data/controlledAccessData_codebook.txt
new file mode 100644
index 0000000..f29d5de
--- /dev/null
+++ b/data/controlledAccessData_codebook.txt
@@ -0,0 +1,16 @@
+Dataset whichControlledAccess AnVIL_Availability
+All of Us* All of Us non-AnVIL hosted
+UK Biobank* UK Biobank non-AnVIL hosted
+Centers for Common Disease Genomics (CCDG) CCDG AnVIL hosted
+The Centers for Mendelian Genomics (CMG) CMG AnVIL hosted
+Clinical Sequencing Evidence-Generating Research (CSER) CSER AnVIL hosted
+Electronic Medical Records and Genomics (eMERGE) eMERGE AnVIL hosted
+Gabriella Miller Kids First (GMKF) GMKF non-AnVIL hosted
+Genomics Research to Elucidate the Genetics of Rare Diseases GREGoR AnVIL hosted
+The Genotype-Tissue Expression Project (GTEx) GTEx AnVIL hosted
+The Human Pangenome Reference Consortium (HPRC) HPRC AnVIL hosted
+Population Architecture Using Genomics and Epidemiology (PAGE) PAGE AnVIL hosted
+Undiagnosed Disease Network (UDN) UDN non-AnVIL hosted
+Being able to pull other dbGap data as needed., Cancer omics datasets Other NA
+GnomAD and ClinVar None NA
+TCGA TCGA non-AnVIL hosted
diff --git a/data/institution_codebook.txt b/data/institution_codebook.txt
new file mode 100644
index 0000000..8336190
--- /dev/null
+++ b/data/institution_codebook.txt
@@ -0,0 +1,30 @@
+InstitutionalAffiliation CorrespondingSurveyEntries InstitutionalType
+Arizona State University Arizona State University R1 University
+Baylor College of Medicine Baylor College of Medicine Medical Center or School
+Boston Children's Hospital Boston Children's Hospital Medical Center or School
+Broad Institute "Broad, broad institute, Broad Institute" Research Center
+Carnegie Institution of Washington Carnegie Institution of Washington Research Center
+City University of New York "City University of New York, CUNY School of Public Health; Roswell Park Comprehensive Cancer Center" R2 University
+Clovis Community College Clovis Community College Community College
+Columbia University Irving Medical Center Columbia University Irving Medical Center R1 University
+Harvard University "harvard, Harvard Public Health" R1 University
+Johns Hopkins "Johns hopkins, Johns Hopkins, Johns Hopkins University" R1 University
+Lieber Institute for Brain Development Lieber Institute for Brain Development Research Center
+Moffitt Cancer Center Moffitt Cancer Center Medical Center or School
+NHGRI NHGRI NIH
+Oregon Health & Science University "OHSU, OHSU (Knight Center)" R1 University
+Pacific Biosciences Pacific Biosciences Industry
+Penn State University Penn State University R1 University
+Stanford University Stanford University R1 University
+Ohio State University The Ohio State University R1 University
+University of California Santa Cruz "UCSC, univ. ca. santa cruz, university of California santa cruz " R1 University
+UMass Chan Medical School "Umass Chan Medical School, UMASS Chan Medical School" Medical Center or School
+University of Minnesota University of Minnesota R1 University
+University of Queensland University of Queensland International Location
+University of Texas at El Paso University of Texas at El Paso R1 University
+University of Virginia University of Virginia R1 University
+University of Washington University of Washington R1 University
+Vanderbilt University Medical Center Vanderbilt University Medical Center R1 University
+Washington University in St. Louis "Washington University in St. 
Louis, Washington University in St Louis" R1 University +Yikon Genomics yikongene Industry +Unknown v Unknown \ No newline at end of file diff --git a/docs/1PANHDY3T9wpEX1GBYcqkdYMG0c20UeZPH2pwjHFcLik.pptx b/docs/1PANHDY3T9wpEX1GBYcqkdYMG0c20UeZPH2pwjHFcLik.pptx new file mode 100644 index 0000000..02e9828 Binary files /dev/null and b/docs/1PANHDY3T9wpEX1GBYcqkdYMG0c20UeZPH2pwjHFcLik.pptx differ diff --git a/docs/hosting.html b/docs/CurrentUserQs.html similarity index 66% rename from docs/hosting.html rename to docs/CurrentUserQs.html index 6d6e522..bfdbca9 100644 --- a/docs/hosting.html +++ b/docs/CurrentUserQs.html @@ -11,10 +11,10 @@ -Hosting +Current User Specific Questions - - + + @@ -32,8 +32,8 @@ - - + + @@ -61,7 +61,6 @@ - @@ -87,9 +86,6 @@ summary { display: list-item; } -details > summary > p:only-child { - display: inline; -} pre code { padding: 0; } @@ -146,15 +142,11 @@ href = "index.html"; var menuAnchor = $('a[href="' + href + '"]'); - // mark the anchor link active (and if it's in a dropdown, also mark that active) - var dropdown = menuAnchor.closest('li.dropdown'); - if (window.bootstrap) { // Bootstrap 4+ - menuAnchor.addClass('active'); - dropdown.find('> .dropdown-toggle').addClass('active'); - } else { // Bootstrap 3 - menuAnchor.parent().addClass('active'); - dropdown.addClass('active'); - } + // mark it active + menuAnchor.tab('show'); + + // if it's got a parent navbar menu mark it active as well + menuAnchor.closest('li.dropdown').addClass('active'); // Navbar adjustments var navHeight = $(".navbar").first().height() + 15; @@ -183,8 +175,8 @@ border-radius: 4px; } -.tabset-dropdown > .nav-tabs > li.active:before, .tabset-dropdown > .nav-tabs.nav-tabs-open:before { - content: "\e259"; +.tabset-dropdown > .nav-tabs > li.active:before { + content: ""; font-family: 'Glyphicons Halflings'; display: inline-block; padding: 10px; @@ -192,11 +184,18 @@ } .tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before { - content: "\e258"; - font-family: 'Glyphicons Halflings'; + content: ""; border: none; } +.tabset-dropdown > .nav-tabs.nav-tabs-open:before { + content: ""; + font-family: 'Glyphicons Halflings'; + display: inline-block; + padding: 10px; + border-right: 1px solid #ddd; +} + .tabset-dropdown > .nav-tabs > li.active { display: block; } @@ -238,12 +237,12 @@