diff --git a/anvilPoll2024MainAnalysis.Rmd b/anvilPoll2024MainAnalysis.Rmd index 9b304d9..cdd738e 100644 --- a/anvilPoll2024MainAnalysis.Rmd +++ b/anvilPoll2024MainAnalysis.Rmd @@ -117,6 +117,7 @@ ggsave(here("plots/degree_furthersimplified_usertype.png")) #set plot size - Use this information (together with other info?) to try to cluster respondents/users into personas; see `PersonaStats.Rmd` + ### Prepare and plot the data
Description of variable definitions and steps @@ -320,7 +321,9 @@ PlotToolKnowledge_avg_score <- geom_point(aes(color = UserType, shape = AnVILorNo)) -PlotToolKnowledge_customization(PlotToolKnowledge_avg_score) +PlotToolKnowledge_avg_score %<>% PlotToolKnowledge_customization() + +PlotToolKnowledge_avg_score ggsave(here("plots/tooldataresourcecomfortscore_singlepanel.png"), w = 2200, h = 1350, units = "px") ``` @@ -461,20 +464,16 @@ We adjust various aspects of the theme like turning off the grid and background genomicsExpPlot <- ggplot(experienceDf, aes(x=experienceLevel,y=n, fill = experienceLevel)) + facet_grid(~researchType) + geom_bar(stat="identity") + - theme_bw() + - theme(panel.background = element_blank(), panel.grid = element_blank()) + - theme(axis.text.x = element_text(angle = 45, hjust=1)) + geom_text( aes(label = after_stat(y), group = experienceLevel), stat = 'summary', fun = sum, vjust = -0.5, size=2 ) + - ylab("Count") + xlab ("Reported Experience Level") + coord_cartesian(clip = "off") + theme(plot.margin = margin(1,1,1,1.05, "cm")) + - scale_fill_manual(values = c("#035C94","#035385","#024A77","#024168", "#02395B")) + - theme(legend.position = "none")+ ggtitle("How much experience do you have analyzing the following data categories?") +genomicsExpPlot %<>% stylize_bar(usertypeColor = FALSE, sequentialColor = TRUE, ylabel = "Count", xlabel = "Reported Experience Level", rotate=55, hjustv = 1) + genomicsExpPlot ggsave(here("plots/researchExperienceLevel_sequentialColor_noUserTypeSplit.png")) #set plot size @@ -482,7 +481,9 @@ ggsave(here("plots/researchExperienceLevel_sequentialColor_noUserTypeSplit.png") ## Experience: Controlled Access Datasets -**Takeaway:** Over half of respondents report they are extremely interested in working with controlled access datasets. +**Takeaway:** Generally, over half of respondents report they are extremely interested in working with controlled access datasets. + +For specific controlled access datasets ... 
- Of the survey provided choices, respondents have accessed or are particularly interested in accessing [All of Us](https://www.researchallofus.org/), [UK Biobank](https://www.ukbiobank.ac.uk/enable-your-research/about-our-data), and [GTEx](https://anvilproject.org/data/consortia/GTEx) (though All of Us and UK Biobank are not currently AnVIL hosted). - 2 respondents (moderately or extremely experienced with genomic data) specifically wrote in ["TCGA"](https://www.cancer.gov/ccg/research/genome-sequencing/tcga). @@ -515,6 +516,40 @@ Since this is a select all that apply question, we expect that there will be mul
Description of variable definitions and steps for preparing the data +
+
+```{r}
+dataInterest <- resultsTidy %>%
+  group_by(InterestControlledData) %>%
+  tally(name = "count") # `count` = number of responses at each interest level
+```
+
+
Description of variable definitions and steps for preparing bar plot + +
+
+```{r}
+# Bar chart of response counts per interest level; bold labels mark levels 1 and 5
+dataInterestPlot <- ggplot(
+  dataInterest,
+  aes(x = InterestControlledData, y = count, fill = as.factor(InterestControlledData))
+) +
+  geom_bar(stat = "identity") +
+  annotation_custom(textGrob("Extremely\ninterested", gp = gpar(fontsize = 8, fontface = "bold")), xmin = 5, xmax = 5, ymin = -3.5, ymax = -3.5) +
+  annotation_custom(textGrob("Not at all\ninterested", gp = gpar(fontsize = 8, fontface = "bold")), xmin = 1, xmax = 1, ymin = -3.5, ymax = -3.5) +
+  geom_text(aes(label = count, group = InterestControlledData), vjust = -1, size = 2) +
+  scale_y_continuous(breaks = pretty_breaks()) +
+  coord_cartesian(clip = "off") + # clipping off so the labels placed at y = -3.5 are not cut
+  theme(plot.margin = margin(1, 1, 1, 1.1, "cm")) +
+  ggtitle("How interested are you in working with controlled access datasets?")
+
+dataInterestPlot <- stylize_bar(dataInterestPlot, usertypeColor = FALSE, sequentialColor = TRUE, xlabel = "Interest level", ylabel = "Count")
+
+dataInterestPlot
+```
+
+
Description of variable definitions and steps for preparing the data + Using a function `prep_df_whichData()` which is in the `shared_functions.R` script since we'll be using this workflow a few times for different subsets of the data, because we want to be able to differentially display the data based on the experience status (experienced with clinical research, human genomics research, etc.) of the person saying they'd like access to the data. We want to color the bars based on whether or not the controlled access dataset is available on the AnVIL currently. We create a dataframe `onAnVILDF` to report this. Used the [AnVIL dataset catalog/browser](https://explore.anvilproject.org/datasets) to find out this information. However, HPRC and GREGoR don't show up in that resource, but are both available per these sources: [Announcement for HPRC](https://anvilproject.org/news/2021/03/11/hprc-on-anvil), [Access for HPRC](https://anvilproject.org/data/consortia/HPRC), [Access for GREGoR](https://anvilproject.org/data/consortia/GREGoR). Both GMKF and TCGA are data hosted on other NCPI platforms that are accessible via AnVIL because of interoperability. (See: https://www.ncpi-acc.org/ and https://ncpi-data.org/platforms). We list these as non-AnVIL hosted since while accessible, they are not AnVIL hosted and inaccessible without NCPI. Finally, UDN is described as non-AnVIL hosted as it is in the Data submission pipeline and not yet available. 
@@ -651,7 +686,9 @@ demoPlotRaw <- resultsTidy %>% geom_bar(stat = "identity") + ggtitle("Have you attended a monthly AnVIL Demo?") -stylize_bar(demoPlotRaw) +demoPlotRaw %<>% stylize_bar() + +demoPlotRaw ``` #### Responses recoded to focus on awareness @@ -666,8 +703,9 @@ demoPlot <- resultsTidy %>% geom_bar(stat = "identity") + ggtitle("Have you attended a monthly AnVIL Demo?") -stylize_bar(demoPlot) + - ylab("Awareness") +demoPlot %<>% stylize_bar(ylabel = "Awareness") + +demoPlot ``` ## Awareness: AnVIL Support Forum @@ -716,7 +754,9 @@ forumPlotRaw <- ggplot(forumdf, geom_bar(stat = "identity") + ggtitle("Have you ever read or posted in our AnVIL Support Forum?") -stylize_bar(forumPlotRaw) +forumPlotRaw %<>% stylize_bar() + +forumPlotRaw ``` #### Responses recoded to focus on awareness @@ -726,8 +766,9 @@ forumPlot <- ggplot(forumdf, aes(y = forumAwareness, x = count, fill = UserType) geom_bar(stat = "identity") + ggtitle("Have you ever read or posted in our AnVIL Support Forum?") -stylize_bar(forumPlot) + - ylab("Awareness") +forumPlot %<>% stylize_bar(ylabel = "Awareness") + +forumPlot ``` ## Preferences: Feature Importance Ranking @@ -903,18 +944,10 @@ toPlotTrainingRanks <- bind_rows( tdumbbell <- ggplot(toPlotTrainingRanks, aes(x = avgRank, y = reorder(TrainingType, -avgRank))) + geom_line() + geom_point(aes(color = UserType), size = 3) + - theme(panel.background = element_blank()) + theme_bw() + theme(legend.position = "bottom") + - xlab("Average Rank") + - ylab("Training Workshop Modality") + - ggtitle("Please rank how/where you would prefer to attend\nAnVIL training workshops.") + - scale_color_manual(values = c("#E0DD10", "#035C94")) + - theme(legend.title=element_blank()) + - scale_x_continuous(breaks = 5:1, labels = 5:1, limits = c(1,5))+ - scale_x_reverse(limits = c(5,1)) + - annotation_custom(textGrob("Most\npreferred", gp=gpar(fontsize=8, fontface = "bold")),xmin=-1,xmax=-1,ymin=-0.5,ymax=-0.5) + - annotation_custom(textGrob("Least\npreferred", 
gp=gpar(fontsize=8, fontface= "bold")),xmin=-5,xmax=-5,ymin=-0.5,ymax=-0.5) + - coord_cartesian(clip = "off") + - theme(plot.margin = margin(1,1,1,1.1, "cm")) + ggtitle("Please rank how/where you would prefer to attend\nAnVIL training workshops.") + + +tdumbbell %<>% stylize_dumbbell(preference = TRUE, xlabel = "Average Rank", ylabel = "Training Workshop Modality", xmax=5) tdumbbell @@ -955,8 +988,10 @@ whereRunPlot <- resultsTidy %>% geom_bar(stat="identity") + ggtitle("Where do you currently run analyses?") -stylize_bar(whereRunPlot) + - ylab("Platform") +whereRunPlot %<>% stylize_bar(ylabel = "Platform") + +whereRunPlot + ``` ## Preferences: DMS compliance/data repositories @@ -1006,19 +1041,15 @@ toPlotFundingSource <- resultsTidy %>% separate(FundingSources, c("WhichA", "Whi ```{r} -toPlotFundingSource %>% ggplot(aes(y = reorder(whichFundingSource,count), x = count, fill = UserType)) + +fundingSourcePlot <- toPlotFundingSource %>% ggplot(aes(y = reorder(whichFundingSource,count), x = count, fill = UserType)) + geom_bar(position = "stack", stat = "identity") + - scale_fill_manual(values = c("#E0DD10", "#035C94")) + - theme_bw() + - ggtitle("What source(s) of funds do you use to pay for cloud computing?") + - xlab("Count") + - ylab("Funding Source") + - theme(panel.background = element_blank(), - panel.grid.minor.x = element_blank(), - panel.grid.minor.y = element_blank(), - panel.grid.major.y = element_blank()) - -ggsave(here("plots/fundingsources.png")) #set save size + ggtitle("What source(s) of funds do you use to pay for cloud computing?") + +fundingSourcePlot %<>% stylize_bar(ylabel="Funding Source") + +fundingSourcePlot + +ggsave(here("plots/fundingsources.png"), plot = fundingSourcePlot) #set save size ``` ## Returning User: Length of Use of the AnVIL @@ -1038,10 +1069,10 @@ timeUsePlot <- resultsTidy %>% vjust = -1, size=2) + ggtitle("How long have you been using the AnVIL?") -stylize_bar(timeUsePlot, usertypeColor = FALSE, singleColor = TRUE) + - 
xlab("Years of Use") + - ylab("Count") + - theme(legend.position = "none") +timeUsePlot %<>% stylize_bar(usertypeColor = FALSE, singleColor = TRUE, xlabel = "Years of Use", ylabel = "Count") + +timeUsePlot + ``` ## Returning User: Foreseeable Computational Needs @@ -1065,8 +1096,9 @@ compNeedsPlot <- resultsTidy %>% geom_bar(stat = "identity") + ggtitle("What computational and storage resources do you foresee\nneeding in the next 12 months?") - stylize_bar(compNeedsPlot, usertypeColor = FALSE, singleColor = TRUE) + - theme(legend.position = "none") +compNeedsPlot %<>% stylize_bar(usertypeColor = FALSE, singleColor = TRUE) + +compNeedsPlot ``` @@ -1092,10 +1124,10 @@ recLikePlot <- resultsTidy %>% geom_text(aes(label = count, group = RecommendationLikelihood), vjust = -1, size=2) - stylize_bar(recLikePlot, usertypeColor = FALSE, sequentialColor = TRUE) + - xlab("Recommendation likelihood") + - ylab("Count") + - theme(legend.position = "none") +recLikePlot %<>% stylize_bar(usertypeColor = FALSE, sequentialColor = TRUE, xlabel = "Recommendation likelihood", ylabel = "Count") + +recLikePlot + ``` ## Session Info diff --git a/pages/Awareness.Rmd b/pages/Awareness.Rmd index 0726e6d..9010d92 100644 --- a/pages/Awareness.Rmd +++ b/pages/Awareness.Rmd @@ -4,11 +4,10 @@ output: html_document --- ```{r, message = FALSE, echo = FALSE,results='hide', warning=FALSE} -library(tidyverse) library(here) -# Inherit `resultsTidy` -knitr::knit_child(here("TidyData.Rmd")) +# Inherit plots +knitr::knit_child(here("anvilPoll2024MainAnalysis.Rmd")) # Import functions to stylize plots source(here("resources/scripts/shared_functions.R")) ``` @@ -20,94 +19,33 @@ source(here("resources/scripts/shared_functions.R")) ## Raw responses ```{r echo=FALSE, message=FALSE} -demoPlotRaw <- resultsTidy %>% - group_by(UserType, AnVILDemo) %>% - summarize(count = n()) %>% - ggplot(aes(y=reorder(AnVILDemo, count), - x = count, - fill = UserType)) + - geom_bar(stat = "identity") + - ggtitle("Have you 
attended a monthly AnVIL Demo?") - -stylize_bar(demoPlotRaw) +demoPlotRaw ``` ## Awareness ```{r, message=FALSE, echo = FALSE} -demoPlot <- resultsTidy %>% - group_by(UserType, AnVILDemoAwareness) %>% - summarize(count = n()) %>% - ggplot(aes(y = AnVILDemoAwareness, - x = count, - fill = UserType)) + - geom_bar(stat = "identity") + - ggtitle("Have you attended a monthly AnVIL Demo?") - -stylize_bar(demoPlot) + - ylab("Awareness") - +demoPlot ``` ## Takeaway -Most respondents have not attended an AnVIL Demo. To investigate whether this is an awareness issue, we aggregated all responses except `No, didn't know of`. We see that the majority of respondents are aware of AnVIL Demos. These responses are just distributed among different ways of utilizing the demos. Further, there's awareness among both current and potential AnVIL users. +Most respondents have not attended an AnVIL Demo. To investigate whether this is an awareness issue, we aggregated all responses except `No, didn't know of`. We see that the majority of respondents are aware of AnVIL Demos. These responses are just distributed among different ways of utilizing the demos. Further, there's awareness among both current and potential AnVIL users.
# AnVIL Support Forum -```{r, message=FALSE, echo=FALSE} -forumdf <- resultsTidy %>% - mutate(AnVILSupportForum = str_replace(AnVILSupportForum, - pattern = "No, ", - replacement= "No ")) %>% - separate(AnVILSupportForum, - c("forumInteractionA", "forumInteractionB", "forumInteractionC"), - sep = ", ", - fill = "right") %>% - pivot_longer(starts_with("forumInteraction"), values_to = "forumInteractionDescription") %>% - group_by(UserType, CurrentUsageDescription, forumInteractionDescription) %>% - summarize(count = n()) %>% - drop_na() %>% - mutate(forumInteractionDescription = - factor(forumInteractionDescription, levels = c("Posted in", "Answered someone's post", "Read through others' posts", "No but aware of", "No didn't know of")), - forumAwareness = factor( - case_when( - forumInteractionDescription == "Posted in" ~ "Aware of", - forumInteractionDescription == "Answered someone's post" ~ "Aware of", - forumInteractionDescription == "Read through others' posts" ~ "Aware of", - forumInteractionDescription == "No but aware of" ~ "Aware of", - forumInteractionDescription == "No didn't know of" ~ "Not Aware of" - ), levels = c("Not Aware of", "Aware of")) -) -``` - ## Raw Responses ```{r, message=FALSE, echo=FALSE} -forumPlotRaw <- ggplot(forumdf, - aes(y = reorder(forumInteractionDescription, count), - x = count, - fill = UserType)) + - geom_bar(stat = "identity") + - ggtitle("Have you ever read or posted in our AnVIL Support Forum?") - -stylize_bar(forumPlotRaw) +forumPlotRaw ``` ## Awareness ```{r, message=FALSE, echo=FALSE} -forumPlot <- ggplot(forumdf, - aes(y = forumAwareness, - x = count, - fill = UserType)) + - geom_bar(stat = "identity") + - ggtitle("Have you ever read or posted in our AnVIL Support Forum?") - -stylize_bar(forumPlot) + - ylab("Awareness") +forumPlot ``` ## Takeaway diff --git a/pages/CurrentUserQs.Rmd b/pages/CurrentUserQs.Rmd index a8449d5..d7cf070 100644 --- a/pages/CurrentUserQs.Rmd +++ b/pages/CurrentUserQs.Rmd @@ -6,13 +6,10 @@ output: 
html_document --- ```{r, message = FALSE, echo = FALSE,results='hide', warning=FALSE} -library(tidyverse) library(here) -library(grid) #for Grobs -library(scales) #pretty breaks -# Inherit `resultsTidy` -knitr::knit_child(here("TidyData.Rmd")) +# Inherit plots +knitr::knit_child(here("anvilPoll2024MainAnalysis.Rmd")) # Import functions to stylize plots source(here("resources/scripts/shared_functions.R")) ``` @@ -22,22 +19,7 @@ source(here("resources/scripts/shared_functions.R")) # Length of Use of the AnVIL ```{r, message = FALSE, echo = FALSE} -timeUsePlot <- resultsTidy %>% - group_by(LengthOfUse) %>% - summarize(count = n()) %>% - drop_na() %>% - ggplot(aes(x = LengthOfUse, - y = count, - fill = "#25445A")) + - geom_bar(stat = "identity") + - geom_text(aes(label = count, group = LengthOfUse), - vjust = -1, size=2) + - ggtitle("How long have you been using the AnVIL?") - -stylize_bar(timeUsePlot, usertypeColor = FALSE, singleColor = TRUE) + - xlab("Years of Use") + - ylab("Count") + - theme(legend.position = "none") +timeUsePlot ``` ## Takeaway @@ -49,24 +31,7 @@ We observe a fairly even sampling of the current users with regards to the numbe # Foreseeable Computational Needs ```{r, message=FALSE, echo=FALSE, warning=FALSE} -compNeedsPlot <- resultsTidy %>% - separate(NeededResources, - c("whichResourceA", "whichResourceB", "whichResourceC", "whichResourceD"), - sep = ", ", fill = "right") %>% - pivot_longer(starts_with("whichResource"), values_to = "ResourceDescription") %>% - group_by(ResourceDescription) %>% - summarize(count = n()) %>% - drop_na() %>% - ggplot(aes(x = count, - y = reorder(ResourceDescription, count), - fill = "#25445A")) + - geom_text(aes(label = count, group = ResourceDescription), - hjust = -1, size=2) + - geom_bar(stat = "identity") + - ggtitle("What computational and storage resources do you foresee\nneeding in the next 12 months?") - - stylize_bar(compNeedsPlot, usertypeColor = FALSE, singleColor = TRUE) + - theme(legend.position = 
"none") +compNeedsPlot ``` ## Takeaway @@ -82,27 +47,7 @@ Of the `r nrow(resultsTidy %>% filter(UserType == "Current User"))` current user # Recommendation Likelihood ```{r, message = FALSE, echo = FALSE} -recLikePlot <- resultsTidy %>% - group_by(RecommendationLikelihood) %>% - summarize(count = n()) %>% - drop_na() %>% #not asked to everyone - ggplot(aes(x = RecommendationLikelihood, - y = count, - fill = as.factor(RecommendationLikelihood))) + - geom_bar(stat="identity") + - ggtitle("How likely are you to recommend the AnVIL to a colleague?") + - coord_cartesian(clip = "off") + - theme(plot.margin = margin(1,1,1.2,1, "cm")) + - annotation_custom(textGrob("Extremely likely", gp=gpar(fontsize=8, fontface = "bold")),xmin=5,xmax=5,ymin=-1.25,ymax=-1.25) + - annotation_custom(textGrob("Not at all likely", gp=gpar(fontsize=8, fontface= "bold")),xmin=1,xmax=1,ymin=-1.25,ymax=-1.25) + - scale_y_continuous(breaks= pretty_breaks()) + - geom_text(aes(label = count, group = RecommendationLikelihood), - vjust = -1, size=2) - - stylize_bar(recLikePlot, usertypeColor = FALSE, sequentialColor = TRUE) + - xlab("Recommendation likelihood") + - ylab("Count") + - theme(legend.position = "none") +recLikePlot ``` ## Takeaway diff --git a/pages/Demographics.Rmd b/pages/Demographics.Rmd index 329fb4e..7800766 100644 --- a/pages/Demographics.Rmd +++ b/pages/Demographics.Rmd @@ -6,14 +6,11 @@ output: html_document --- ```{r, message = FALSE, echo = FALSE,results='hide', warning=FALSE} -library(tidyverse) library(here) -library(grid) #for Grobs -library(scales) #pretty breaks library(kableExtra) -# Inherit `resultsTidy` -knitr::knit_child(here("TidyData.Rmd")) +# Inherit plots +knitr::knit_child(here("anvilPoll2024MainAnalysis.Rmd")) # Import functions to stylize plots source(here("resources/scripts/shared_functions.R")) ``` @@ -23,22 +20,7 @@ source(here("resources/scripts/shared_functions.R")) # Highest Degree ```{r, message=FALSE, echo=FALSE} -degreePlot <- resultsTidy %>% - 
group_by(FurtherSimplifiedDegrees, UserType) %>% - summarize(n = n()) %>% - ggplot(aes(y = reorder(FurtherSimplifiedDegrees, n, sum), - x = n, - fill = UserType - )) + - geom_bar(position = "stack", stat="identity") + - geom_text( - aes(label = after_stat(x), group = FurtherSimplifiedDegrees), - stat = 'summary', fun = sum, hjust = -1, size=2 - ) + - coord_cartesian(clip = "off") + - ggtitle("What is the highest degree you have attained?") - -stylize_bar(degreePlot) +degreePlot ``` ## Takeaway @@ -50,37 +32,7 @@ Most of the respondents have a PhD or are currently working on a PhD, though a r # Kind of Work ```{r, message = FALSE, echo = FALSE} -dfForPlotKOW <- resultsTidy %>% - separate(KindOfWork, - c("whichWorkA", "whichWorkB", "whichWorkC", "whichWorkD", "whichWorkE", "whichWorkF", "whichWorkG", "whichWorkH", "whichWorkI", "whichWorkJ"), - sep=", ", fill="right") %>% - pivot_longer(starts_with("whichWork"), values_to = "whichWorkDescription") %>% - select(Timestamp, UserType, whichWorkDescription) %>% - mutate(whichWorkDescription = - recode(whichWorkDescription, - "computational education" = "Computational education", - "Program administration," = "Program administration"), - whichWorkDescription = factor(whichWorkDescription), - Timestamp = factor(Timestamp) - ) %>% - drop_na() - -factorLevel <- as.data.frame(table(dfForPlotKOW$whichWorkDescription)) %>% arrange(-Freq) %>% select(Var1) %>% unlist() %>% unname() %>% rev() - -ggplot(dfForPlotKOW, - aes(x = Timestamp, - y = factor(whichWorkDescription, levels = factorLevel), - fill = whichWorkDescription - )) + - geom_tile() + - theme_bw() + - theme(axis.text.x=element_blank(), - axis.ticks.x=element_blank(), - legend.position = "None") + - ylab("") + - ggtitle("What kind of work do you do?") + - xlab("Respondent") + - facet_wrap(~UserType) +kowPlot ``` ## Takeaway @@ -96,29 +48,7 @@ Only a few responses report project management, leadership, or administration as # Institutional Affiliation ```{r, message=FALSE, 
echo = FALSE} -instPlot <- resultsTidy %>% - mutate(FurtherSimplifiedInstitutionalType = - factor(FurtherSimplifiedInstitutionalType, - levels = c("Industry & Other", "Education Focused", "Research Intensive"))) %>% - group_by(UserType, FurtherSimplifiedInstitutionalType) %>% - summarize(InstitutionalCount = n()) %>% - ggplot(aes( - y = FurtherSimplifiedInstitutionalType, - x = InstitutionalCount, - fill = UserType - )) + - geom_bar(position = "stack", stat = "identity") + - geom_text(aes(label = after_stat(x), - group = FurtherSimplifiedInstitutionalType), - stat = 'summary', fun = sum, hjust = -1, size=2 - ) + - annotation_custom(textGrob("- R1 University \n- Med Campus \n- Research Center\n- NIH ", gp = gpar(fontsize = 8)), xmin = -8.5, xmax = -8.5, ymin = 2.65, ymax = 2.65) + - annotation_custom(textGrob("- Industry \n- International Loc\n- Unknown ", gp = gpar(fontsize = 8)), xmin = -8.5, xmax = -8.5, ymin = .7, ymax = .7) + - annotation_custom(textGrob("- R2 University \n- Community College", gp=gpar(fontsize=8)),xmin=-8.5,xmax=-8.5,ymin=1.75,ymax=1.75) + - coord_cartesian(clip = "off") + - ggtitle("What institution are you affiliated with?") - -stylize_bar(instPlot) +instTypePlot ``` ## Takeaway @@ -133,27 +63,8 @@ Most of the individuals using the AnVIL report being affiliated with a research # Consortia Affiliations -```{r, message=FALSE, echo = FALSE} -consortiaTable <- resultsTidy %>% - mutate(ConsortiaAffiliations = str_replace_all(ConsortiaAffiliations, c(";|&| and"), ",")) %>% - separate(ConsortiaAffiliations, - c("whichConsortiumA", "whichConsortiumB", "whichConsortiumC", "whichConsortiumD"), - sep=", ", fill = "right") %>% - pivot_longer(starts_with("whichConsortium"), values_to = "whichConsortiumName") %>% - group_by(whichConsortiumName) %>% - summarize(count = n()) %>% - drop_na() %>% - arrange(count) -``` - - Of `r nrow(resultsTidy)` responses, `r sum(!is.na(resultsTidy$ConsortiaAffiliations))` provide an affiliation, with `r 
nrow(consortiaTable)` unique affiliations represented across those responses (respondents could select more than one consortium). The following table shows the most represented consortia. ```{r, message = FALSE, echo = FALSE} -consortia_df <- - consortiaTable[which(consortiaTable$count >1),] %>% - rename(`consortium` = whichConsortiumName) - kableExtra::kable(consortia_df, table.attr = "style='width:20%;'") ``` - diff --git a/pages/Experience.Rmd b/pages/Experience.Rmd index 0faff9d..a0d254c 100644 --- a/pages/Experience.Rmd +++ b/pages/Experience.Rmd @@ -5,12 +5,10 @@ date: "" --- ```{r, message = FALSE, echo = FALSE,results='hide', warning=FALSE} -library(tidyverse) library(here) -library(grid) #for Grobs -# Inherits `resultsTidy` -knitr::knit_child(here("TidyData.Rmd")) +# Inherit plots +knitr::knit_child(here("anvilPoll2024MainAnalysis.Rmd")) # Import functions to stylize plots source(here("resources/scripts/shared_functions.R")) ``` @@ -20,71 +18,7 @@ source(here("resources/scripts/shared_functions.R")) # Tool & Resource Knowledge/Comfort level ```{r, message=FALSE, echo = FALSE} -toPlotToolKnowledge <- bind_rows( - resultsTidy %>% - filter(UserType == "Current User") %>% - select(starts_with("Score_")) %>% - colSums(na.rm = TRUE) %>% - as.data.frame() %>% `colnames<-`(c("totalScore")) %>% - mutate(nscores = sum(resultsTidy$UserType == "Current User"), - avgScore = totalScore / nscores, - UserType = "Current Users") %>% - mutate(WhereTool = rownames(.)) %>% - separate(WhereTool, c("AnVILorNo", "Tool"), sep = "Tech", remove = TRUE) %>% - mutate(AnVILorNo = - case_when(AnVILorNo == "Score_CurrentAnVIL" ~ "On the AnVIL", - AnVILorNo == "Score_All" ~ "Separate from the AnVIL" - ), - Tool = - recode(Tool, "JupyterNotebooks" = "Jupyter Notebooks", - "WDL" = "Workflows", - "CommandLine" = "Unix / Command Line", - "AccessData" = "Access controlled access data", - "Terra" = "Terra Workspaces", - "BioconductorRStudio" = "Bioconductor & RStudio" - ) - ), - resultsTidy 
%>% - filter(UserType == "Potential User") %>% - select(starts_with("Score_AllTech")) %>% - colSums() %>% - as.data.frame() %>% `colnames<-`(c("totalScore")) %>% - mutate(nscores = sum(resultsTidy$UserType == "Potential User"), - avgScore = totalScore / nscores, - UserType = "Potential Users") %>% - mutate(WhereTool = rownames(.)) %>% - separate(WhereTool, c("AnVILorNo", "Tool"), sep = "Tech", remove = TRUE) %>% - mutate(AnVILorNo = - case_when(AnVILorNo == "Score_CurrentAnVIL" ~ "On the AnVIL", - AnVILorNo == "Score_All" ~ "Separate from the AnVIL" - ), - Tool = - recode(Tool, "JupyterNotebooks" = "Jupyter Notebooks", - "WDL" = "Workflows", - "CommandLine" = "Unix / Command Line", - "AccessData" = "Access controlled access data", - "Terra" = "Terra Workspaces", - "BioconductorRStudio" = "Bioconductor & RStudio" - ) - ) -) %>% - mutate(UserType = factor(UserType, levels = c("Potential Users", "Current Users"))) - -roi <- toPlotToolKnowledge[which(toPlotToolKnowledge$Tool == "Bioconductor & RStudio"),] -toPlotToolKnowledge <- rows_append(toPlotToolKnowledge, data.frame( - UserType = rep(roi$UserType,2), - avgScore = rep(roi$avgScore,2), - AnVILorNo = rep(roi$AnVILorNo,2), - Tool = c("Bioconductor", "RStudio") - )) %>% - rows_delete(., data.frame(roi)) -``` - -```{r message=FALSE, echo=FALSE} - -PlotToolKnowledge_customization(ggplot(toPlotToolKnowledge, - aes(y = reorder(Tool, avgScore), x = avgScore)) + - geom_point(aes(color = UserType, shape = AnVILorNo))) +PlotToolKnowledge_avg_score ``` ## Takeaways @@ -102,9 +36,7 @@ Overall, there is less comfort with containers or workflows than using various p # Types of data analyzed ```{r, message=FALSE, echo=FALSE} -resultsTidy %>% - prep_df_typeData() %>% - plot_type_data() +everyone_type_data ```
@@ -112,39 +44,7 @@ resultsTidy %>% # Genomics and Clinical Research Experience ```{r, message=FALSE, echo = FALSE} -experienceDf <- resultsTidy %>% - select(HumanGenomicExperience, HumanClinicalExperience, NonHumanGenomicExperience, UserType) %>% - pivot_longer(c(HumanGenomicExperience, HumanClinicalExperience, NonHumanGenomicExperience), - names_to = "researchType", - values_to = "experienceLevel") %>% - mutate(experienceLevel = - factor(experienceLevel, levels = c("Not at all experienced", "Slightly experienced", "Somewhat experienced", "Moderately experienced", "Extremely experienced")), - researchType = case_when( - researchType == "HumanClinicalExperience" ~ "Human Clinical Research", - researchType == "HumanGenomicExperience" ~ "Human Genomic Research", - researchType == "NonHumanGenomicExperience" ~ "Non-human\nGenomic Research") - ) %>% - group_by(researchType, experienceLevel, UserType) %>% - summarize(n = n()) - -ggplot(experienceDf, aes(x=experienceLevel,y=n, fill = experienceLevel)) + - facet_grid(~researchType) + - geom_bar(stat="identity") + - theme_bw() + - theme(panel.background = element_blank(), - panel.grid = element_blank()) + - theme(axis.text.x = element_text(angle = 45, hjust=1)) + - geom_text( - aes(label = after_stat(y), group = experienceLevel), - stat = 'summary', fun = sum, vjust = -0.5, size=2 - ) + - ylab("Count") + - xlab ("Reported Experience Level") + - coord_cartesian(clip = "off") + - theme(plot.margin = margin(1,1,1,1.05, "cm")) + - scale_fill_manual(values = c("#035C94","#035385","#024A77","#024168", "#02395B")) + - theme(legend.position = "none")+ - ggtitle("How much experience do you have analyzing\nthe following data categories?") +genomicsExpPlot ``` ## Takeaway @@ -160,26 +60,7 @@ ggplot(experienceDf, aes(x=experienceLevel,y=n, fill = experienceLevel)) + # General interest in controlled access datasets ```{r message=FALSE, echo=FALSE} -dataInterestPlot <- resultsTidy %>% - group_by(InterestControlledData) %>% - 
summarize(count = n()) %>% - ggplot(aes(x = InterestControlledData, - y = count, - fill = as.factor(InterestControlledData))) + - geom_bar(stat="identity") + - ggtitle("How interested are you in working with controlled access datasets?") + - coord_cartesian(clip = "off") + - theme(plot.margin = margin(1,1,1,1.1, "cm")) + - annotation_custom(textGrob("Extremely\ninterested", gp=gpar(fontsize=8, fontface = "bold")),xmin=5,xmax=5,ymin=-3.5,ymax=-3.5) + - annotation_custom(textGrob("Not at all\ninterested", gp=gpar(fontsize=8, fontface= "bold")),xmin=1,xmax=1,ymin=-3.5,ymax=-3.5) + - scale_y_continuous(breaks= pretty_breaks()) + - geom_text(aes(label = count, group = InterestControlledData), - vjust = -1, size=2) - - stylize_bar(dataInterestPlot, usertypeColor = FALSE, sequentialColor = TRUE) + - xlab("Interest level") + - ylab("Count") + - theme(legend.position = "none") +dataInterestPlot ``` ## Takeaway @@ -193,40 +74,25 @@ Over half of respondents report they are extremely interested in working with co ## All responses ```{r message=FALSE, echo=FALSE} -onAnVILDF <- read_delim(here("data/controlledAccessData_codebook.txt"), - delim = "\t", - col_select = c(whichControlledAccess, AnVIL_Availability)) - -resultsTidy %>% - prep_df_whichData(onAnVILDF = onAnVILDF) %>% - plot_which_data() +everyoneDataPlot ``` ## Just responses from those moderately or extremely experienced with clinical data ```{r message=FALSE, echo=FALSE} -resultsTidy %>% - filter(clinicalFlag == TRUE) %>% - prep_df_whichData(onAnVILDF = onAnVILDF) %>% - plot_which_data(subtitle = "Respondents moderately or extremely experienced with clinical data") +clinicalDataPlot ``` ## Just responses from those moderately or extremely experienced with human genomic data ```{r message=FALSE, echo=FALSE} -resultsTidy %>% - filter(humanGenomicFlag == TRUE) %>% - prep_df_whichData(onAnVILDF = onAnVILDF) %>% - plot_which_data(subtitle = "Respondents moderately or extremely experienced with human genomic data") 
+humanGenomicDataPlot ``` ## Just responses from those moderately or extremely experienced with non-human genomic data ```{r message=FALSE, echo=FALSE} -resultsTidy %>% - filter(nonHumanGenomicFlag == TRUE) %>% - prep_df_whichData(onAnVILDF = onAnVILDF) %>% - plot_which_data(subtitle = "Respondents moderately or extremely experienced with non-human genomic data") +nonHumanGenomicDataPlot ``` ## Takeaway diff --git a/pages/IdentifyTypeOfUsers.Rmd b/pages/IdentifyTypeOfUsers.Rmd index f96f83b..9efc1d2 100644 --- a/pages/IdentifyTypeOfUsers.Rmd +++ b/pages/IdentifyTypeOfUsers.Rmd @@ -16,26 +16,7 @@ source(here("resources/scripts/shared_functions.R")) ``` ```{r, message=FALSE, echo=FALSE} -topPlot <- resultsTidy %>% - group_by(UserType, CurrentUsageDescription) %>% - summarize(count = n()) %>% - mutate(CurrentUsageDescription = case_when( - CurrentUsageDescription == "For ongoing projects (e.g., consistent project development and/or work)" ~ "For ongoing projects:\nconsistent project development\nand/or work", - CurrentUsageDescription == "For completed/long-term projects (e.g., occasional updates/maintenance as needed)" ~ "For completed/long-term projects:\noccasional updates/maintenance\nas needed", - CurrentUsageDescription == "For short-term projects (e.g., short, intense bursts separated by a few months)" ~ "For short-term projects:\nshort, intense bursts\nseparated by a few months", - CurrentUsageDescription == "I do not currently use the AnVIL, but have in the past" ~ "I do not current use the AnVIL,\nbut have in the past", - CurrentUsageDescription == "I have never used the AnVIL, but have heard of it" ~ "I have never\nused the AnVIL", - CurrentUsageDescription == "I have never heard of the AnVIL" ~ "I have never\nheard of the AnVIL" - )) %>% - ggplot(aes(x = count, - y = reorder(CurrentUsageDescription, count), - fill = UserType)) + - geom_bar(stat="identity", position ="stack") + - ggtitle("How would you describe your current usage\nof the AnVIL platform?") + 
- geom_text(aes(label = count, group = CurrentUsageDescription), - hjust = -0.5, size=2) - -stylize_bar(topPlot) +typeOfUserPlot ``` ## Takeaway diff --git a/pages/Preferences.Rmd b/pages/Preferences.Rmd index 72ce9a2..c1c2ccc 100644 --- a/pages/Preferences.Rmd +++ b/pages/Preferences.Rmd @@ -17,54 +17,15 @@ source(here("resources/scripts/shared_functions.R")) # Feature Importance Ranking -```{r, message=FALSE, echo = FALSE} -totalRanksdf <- - bind_rows( - resultsTidy %>% - select(starts_with("PotentialRank")) %>% - colSums(na.rm = TRUE) %>% - as.data.frame() %>% `colnames<-`(c("totalRank")) %>% - mutate(nranks = sum(resultsTidy$UserType == "Potential User"), - avgRank = totalRank / nranks), - resultsTidy %>% - select(starts_with("CurrentRank")) %>% - colSums(na.rm = TRUE) %>% - as.data.frame() %>% `colnames<-`(c("totalRank")) %>% - mutate(nranks = sum(resultsTidy$UserType == "Current User"), - avgRank = totalRank /nranks) - ) %>% - mutate(UsertypeFeature = rownames(.)) %>% - separate(UsertypeFeature, c("Usertype", "Feature"), sep = "Rank", remove = TRUE) %>% - mutate(Feature = - case_when(Feature == "EasyBillingSetup" ~ "Easy billing setup", - Feature == "FlatRateBilling" ~ "Flat-rate billing rather than use-based", - Feature == "FreeVersion" ~ "Free version with limited compute or storage", - Feature == "SupportDocs" ~ "On demand support and documentation", - Feature == "ToolsData" ~ "Specific tools or datasets are available/supported", - Feature == "CommunityAdoption" ~ "Greater adoption of the AnVIL by the scientific community"), - Usertype = factor(case_when(Usertype == "Potential" ~ "Potential User", - Usertype == "Current" ~ "Current User"), levels = c("Potential User", "Current User")) - ) -``` - - ```{r message=FALSE, echo=FALSE} -gdumbbell <- ggplot(totalRanksdf, - aes(x = avgRank, - y = reorder(Feature, -avgRank))) + - geom_line() + - geom_point(aes(color = Usertype), size = 3) + - ggtitle("Rank the following features\naccording to their importance 
to\nyou as a potential user or for\nyour continued use of the AnVIL") - - -stylize_dumbbell(gdumbbell, xmax=6, importance = TRUE) +gdumbbell ``` ## Takeaways All respondents rate having specific tools or datasets supported/available as a very important feature for using AnVIL. Compared to current users, potential users rate having a free-version with limited compute or storage as the most important feature for their potential use of the AnVIL. -## Potential Follow-ups +## Potential Follow-ups - Ask what specific tools people want available/supported @@ -73,42 +34,7 @@ All respondents rate having specific tools or datasets supported/available as a # Training Workshop Modality Ranking ```{r, message=FALSE, echo=FALSE} -toPlotTrainingRanks <- bind_rows( - resultsTidy %>% - filter(UserType == "Current User") %>% - select(starts_with("AnVILTrainingWorkshops")) %>% - colSums(na.rm = TRUE) %>% - as.data.frame() %>% `colnames<-`(c("totalRank")) %>% - mutate(nranks = sum(resultsTidy$UserType == "Current User"), - avgRank = totalRank / nranks, - UserType = "Current User") %>% - mutate(TrainingType = rownames(.)) %>% - mutate(TrainingType = str_replace(TrainingType, "AnVILTrainingWorkshops", "")), - resultsTidy %>% - filter(UserType == "Potential User") %>% - select(starts_with("AnVILTrainingWorkshops")) %>% - colSums() %>% - as.data.frame() %>% `colnames<-`(c("totalRank")) %>% - mutate(nranks = sum(resultsTidy$UserType == "Potential User"), - avgRank = totalRank / nranks, - UserType = "Potential User") %>% - mutate(TrainingType = rownames(.)) %>% - mutate(TrainingType = str_replace(TrainingType, "AnVILTrainingWorkshops", "")) - ) %>% mutate(TrainingType = recode(TrainingType, "SpecEvent" = "AnVIL-specific event", "OnSite" = "On-site at my institution", "Conference" = "Conference (e.g., CSHL, AMIA)")) %>% - mutate(UserType = factor(UserType, levels = c("Potential User", "Current User"))) - -``` - -```{r, message=FALSE, echo=FALSE} -tdumbbell <- ggplot(toPlotTrainingRanks, - 
aes(x = avgRank, - y = reorder(TrainingType, -avgRank))) + - geom_line() + - geom_point(aes(color = UserType), size = 3) + - - ggtitle("Please rank how/where you would prefer to attend\nAnVIL training workshops.") - -stylize_dumbbell(tdumbbell, xmax=5, preference = TRUE) +tdumbbell ``` ## Takeaway @@ -120,37 +46,12 @@ Both current and potential users vastly prefer virtual training workshops. # Where analyses are currently run ```{r message=FALSE, echo=FALSE} -whereRunPlot <- resultsTidy %>% - separate(WhereAnalysesRun, - c("whereRunA", "whereRunB", "whereRunC", "whereRunD", "whereRunE", "whereRunF", "whereRunG"), - sep = ", ", fill = "right") %>% - pivot_longer(starts_with("whereRun"), values_to = "wherePlatforms") %>% - mutate(wherePlatforms = - recode(wherePlatforms, - "Amazon Web Services (AWS)" = "AWS", - "Galaxy (usegalaxy.org)" = "Galaxy", - "Galaxy Australia" = "Galaxy", - "Google Cloud Platform (GCP)" = "GCP", - "Institutional High Performance Computing cluster (HPC)" = "Institutional HPC", - "Personal computer (locally)," = "Personal computer (locally)", - "local server" = "Institutional HPC") - ) %>% - group_by(UserType, wherePlatforms) %>% - summarize(count = n()) %>% - drop_na() %>% - ggplot(aes(x = count, - y = reorder(wherePlatforms, count), - fill = UserType)) + - geom_bar(stat="identity") + - ggtitle("Where do you currently run analyses?") - -stylize_bar(whereRunPlot) + - ylab("Platform") +whereRunPlot ``` ## Takeaways -Institutional HPC and locally/personal computers are the most common responses. Google Cloud Platform (GCP) is reported as used more than other cloud providers within this sample. We also see that potential users report using Galaxy (a free option) more than current users do. +Institutional HPC and locally/personal computers are the most common responses. Google Cloud Platform (GCP) is reported as used more than other cloud providers within this sample. 
We also see that potential users report using Galaxy (a free option) more than current users do.
@@ -163,26 +64,7 @@ TBA # Source for cloud computing funds ```{r message=FALSE, echo=FALSE} -plotFundingSource <- resultsTidy %>% - separate(FundingSources, - c("WhichA", "WhichB", "WhichC", "WhichD", "WhichE", "WhichF", "WhichG"), - sep = ", ", - fill="right") %>% - pivot_longer(starts_with("Which"), - names_to = "WhichChoice", - values_to = "whichFundingSource") %>% - drop_na(whichFundingSource) %>% - group_by(whichFundingSource, UserType) %>% - summarize(count = n()) %>% - ggplot(aes(y = reorder(whichFundingSource,count), - x = count, - fill = UserType)) + - geom_bar(position = "stack", stat = "identity") + - ggtitle("What source(s) of funds do you use to pay for cloud computing?") - -stylize_bar(plotFundingSource) + - ylab("Funding Source") - +fundingSourcePlot ``` ## Takeaway diff --git a/resources/scripts/shared_functions.R b/resources/scripts/shared_functions.R index 340b023..dc7a9c9 100644 --- a/resources/scripts/shared_functions.R +++ b/resources/scripts/shared_functions.R @@ -3,27 +3,30 @@ library(magrittr) library(tidyverse) -stylize_bar <- function(gplot, usertypeColor = TRUE, singleColor = FALSE, sequentialColor = FALSE){ +stylize_bar <- function(gplot, usertypeColor = TRUE, singleColor = FALSE, sequentialColor = FALSE, xlabel = "Count", ylabel = "", legendpos = "right", rotate = 0, hjustv = 0){ if (usertypeColor) { fillColors <- c("#E0DD10", "#035C94") } else if (singleColor){ fillColors <- c("#25445A") + legendpos = "none" + } else if (sequentialColor){ fillColors <- c("#035C94","#035385","#024A77","#024168", "#02395B") + legendpos = "none" } return( gplot + theme_classic() + - ylab("") + - xlab("Count") + - theme(legend.title = element_blank()) + + ylab(ylabel) + + xlab(xlabel) + + theme(legend.title = element_blank(), legend.position = legendpos, axis.text.x = element_text(angle=rotate, hjust=hjustv)) + scale_fill_manual(values = fillColors, na.translate = F) ) } -stylize_dumbbell <- function(gplot, xmax = NULL, importance = FALSE, preference = 
FALSE){ +stylize_dumbbell <- function(gplot, xmax = NULL, importance = FALSE, preference = FALSE, xlabel="Average Rank Choice", ylabel=""){ if (importance){ textGrobMost <- "Most\nimportant" textGrobLeast <- "Least\nimportant" @@ -38,8 +41,8 @@ stylize_dumbbell <- function(gplot, xmax = NULL, importance = FALSE, preference theme(panel.background = element_blank(), legend.position = "bottom", legend.title = element_blank()) + - xlab("Average Rank Choice") + - ylab("") + + xlab(xlabel) + + ylab(ylabel) + scale_color_manual(values = c("#E0DD10", "#035C94")) + coord_cartesian(clip = "off") + theme(plot.margin = margin(1,1,1,1.1, "cm")) + @@ -151,14 +154,16 @@ prep_df_typeData <- function(subset_df){ } plot_type_data <- function(inputToPlotDF, subtitle = NULL){ - toreturnplot <- ggplot(inputToPlotDF, aes(x = reorder(whichTypeData, -count), y = count)) + - geom_bar(stat="identity") + - theme_classic() + theme(panel.background = element_blank(), panel.grid = element_blank()) + - theme(axis.text.x = element_text(angle=45, hjust=1)) + - xlab("Types of data") + ylab("Count") + + toreturnplot <- ggplot(inputToPlotDF, aes(x = reorder(whichTypeData, -count), + y = count, + fill = "#25445A")) + + geom_bar(stat="identity") + ggtitle("What types of data do you or would you analyze using the AnVIL?", subtitle = subtitle) + - geom_text(aes(label = after_stat(y), group = whichTypeData), + geom_text(aes(label = after_stat(y), group = whichTypeData), stat = 'summary', fun = sum, vjust = -1, size=2) + coord_cartesian(clip = "off") + + toreturnplot %<>% stylize_bar(usertypeColor = FALSE, singleColor = TRUE, xlabel = "Types of data", ylabel = "Count", hjustv = 1, rotate=45) + return(toreturnplot) }