Merge pull request #189 from CDU-data-science-team/185-update-framework

185 update framework
The-Strategy-Unit · Oct 12, 2023 · cb8d1ac · cb8d1ac
2 parents d144a20 + 63007ac
commit cb8d1ac
Show file tree

Hide file tree

Showing 33 changed files with 347 additions and 252 deletions.
diff --git a/API_url_tracker.Rmd b/API_url_tracker.Rmd
@@ -7,15 +7,7 @@ output: html_document
 
 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE)
-
-# install experience dashboard if not already install
-if ("experiencesdashboard" %in% installed.packages()) {
-  if ("remotes" %in% installed.packages()) install.packages("remotes", repos = "http://cran.r-project.org")
-  remotes::install_github("CDU-data-science-team/experiencesdashboard", upgrade = "never", quiet = TRUE)
-}
-
 library(experiencesdashboard)
-library(dplyr)
 ```
 
 # Last rendered at `r round(Sys.time())`
@@ -40,7 +32,7 @@ pending_jobs <- dplyr::tbl(
     "api_jobs"
   )
 ) |>
-  filter(status == "submitted") |>
+  dplyr::filter(status == "submitted") |>
   dplyr::collect()
 ```
 

diff --git a/NAMESPACE b/NAMESPACE
@@ -6,6 +6,7 @@ export(api_pred)
 export(calculate_table)
 export(clean_dataframe)
 export(demographic_distribution)
+export(drop_na_for_col)
 export(get_api_pred_url)
 export(get_pred_from_url)
 export(html_decoder)

diff --git a/R/app_server.R b/R/app_server.R
@@ -50,7 +50,6 @@ app_server <- function(input, output, session) {
   latest_time <- api_jobs$latest_time
   wait_time <- api_jobs$estimated_wait_time
 
-
   if (!is.null(latest_time) & data_exists) {
     # filter out all the unfinished rows(api job time is same as last_upload_date when doing data upload)
     db_data <- db_data %>%
@@ -275,9 +274,10 @@ app_server <- function(input, output, session) {
         dplyr::arrange(date)
     }
 
-    # TRANSFORM THE SENTIMENT COLUMN
+    # Transform the sentiment column
     return_data <- return_data %>% 
-      transform_sentiment()
+      transform_sentiment() %>% 
+      drop_na_for_col(c('category', 'super_category', 'sentiment'))
 
     # also return a dataset with unique individuals
     unique_data <- return_data %>%

diff --git a/R/app_ui.R b/R/app_ui.R
@@ -7,9 +7,6 @@
 #' @noRd
 app_ui <- function(request) {
   tagList(
-
-    # Leave this function for adding external resources
-    golem_add_external_resources(),
     # List the first level UI elements here
     dashboardPage(
       dashboardHeader(
@@ -37,12 +34,9 @@ app_ui <- function(request) {
       ),
       dashboardBody(
         ### Changing theme ----
-        fresh::use_theme(nhs_shiny_theme()), # <-- use fresh object theme to style the whole dashboard
-
-        #### Add css file for buttons ----
-        includeCSS(system.file("app/www/", "button-style.css",
-          package = "experiencesdashboard"
-        )),
+        fresh::use_theme(nhs_shiny_theme()), # use fresh object theme to style the whole dashboard
+        # Leave this function for adding external resources
+        golem_add_external_resources(),
         HTML('<a href="https://github.com/CDU-data-science-team/experiencesdashboard"
              class="github-corner" aria-label="View source on GitHub"><svg width="80"
              height="80" viewBox="0 0 250 250" style="fill:#64CEAA; color:#fff; position:
@@ -69,7 +63,6 @@ app_ui <- function(request) {
         tabItems(
           tabItem(
             tabName = "experiences-user",
-            # h1("Service User Experiences"),
             mod_patient_experience_ui("patient_experience_ui_1")
           )
         )

diff --git a/R/fct_api_pred.R b/R/fct_api_pred.R
@@ -3,19 +3,24 @@
 #'
 #' @param data Dataframe with column `comment_id`, `comment_text` and `question_type`
 #' @param api_key api key to access the api
+#' @param target to determine the type of prediction, the options are `m` for multilabel,
+#' `s` for sentiment or `ms` for both. default to `ms`
 #' @return a dataframe or a url to get the data
 #' @export
-get_api_pred_url <- function(data, api_key) {
+get_api_pred_url <- function(data, api_key, target = "ms") {
   endpoint <- "https://pxtextmining-docker-api.azurewebsites.net/api/StartContainerInstance"
 
+  # validate the target argument
+  stopifnot("target must be one of 'ms', 'm' or 's'" = target %in% c("ms", "m", "s"))
+
   # convert the dataframe to nested list
   json_data <- data |>
     purrr::array_tree()
 
   r <- httr::POST(
     endpoint,
     body = json_data,
-    query = list(code = api_key),
+    query = list(code = api_key, target = target),
     encode = "json"
   )
 
@@ -62,6 +67,34 @@ get_pred_from_url <- function(api_url) {
   }
 }
 
+#' convert the API prediction into a format that is suitable for the database
+#'
+#' @param prediction dataframe, prediction from the pxtextmining api with column "comment_id","labels" and "sentiment"
+#'
+#' @return dataframe, transform prediction data
+#' @noRd
+transform_prediction_for_database <- function(prediction) {
+  stopifnot('"comment_id","labels" and "sentiment" columns are required' = all(c("comment_id", "labels", "sentiment") %in% names(prediction)))
+
+  prediction |>
+    dplyr::rename(
+      category = labels
+    ) |>
+    dplyr::mutate(dplyr::across(c(comment_id, sentiment), as.integer)) |>
+    #  assign the super categories
+    tidyr::unnest(category) |> # Unnest the category column into rows and columns
+    dplyr::mutate(super_category = assign_highlevel_categories(category)) |> # assign super categories
+    dplyr::group_by(comment_id) %>%
+    dplyr::summarise(
+      across(-tidyselect::all_of(c("category", "super_category")), unique),
+      across(c(category, super_category), list) # duplicate super category value is preserved. This will allow easy manipulation later. for ex. see get_tidy_filter_data()
+    ) |>
+    dplyr::ungroup() |>
+    # convert the category and super category column to raw json before loading into the database
+    dplyr::mutate(across(c(category, super_category), ~ purrr::map(.x, jsonlite::toJSON))) |>
+    dplyr::mutate(across(c(category, super_category), ~ purrr::map(.x, charToRaw)))
+}
+
 #' Track the API job table. If prediction is done, it writes it to the main table and delete the job from the api job table
 #'
 #' @param job an instance of the api job table
@@ -91,20 +124,22 @@ track_api_job <- function(job, conn, write_db = TRUE) {
   if (is.data.frame(prediction)) {
     cat("Job", job_id, "is done \n")
 
-    prediction <- prediction |>
-      dplyr::mutate(dplyr::across(dplyr::everything(), as.integer))
-
     # update the job status as complete (Prediction has been returned)
     DBI::dbExecute(conn, paste("UPDATE api_jobs SET status='completed' WHERE job_id =", job_id))
 
     if (!write_db) {
       # update the job status as uploaded (successfully write prediction to main table)
       DBI::dbExecute(conn, paste("UPDATE api_jobs SET status='uploaded' WHERE job_id =", job_id))
-      
+
       return(prediction)
     }
 
+    prediction <- prediction |>
+      transform_prediction_for_database()
+
     # update the main table
+    cat("Updating database with prediction \n")
+
     dplyr::rows_update(
       dplyr::tbl(conn, trust_id),
       prediction,
@@ -192,8 +227,8 @@ api_question_code <- function(value) {
 #' @param pool database connection
 #' @param trust_id string, the trust id
 #' @param schedule_time integer, number of mins used in the Rmarkdown schedule that tracks the api job table.
-#' default is 15mins. its only needed to guess how long users might have to wait for the prediction to complete 
-#' 
+#' default is 15mins. it's only needed to guess how long users might have to wait for the prediction to complete
+#'
 #' @return list of latest_time and estimated_wait_time (in mins)
 #' @noRd
 check_api_job <- function(pool, trust_id = get_golem_config("trust_name"), schedule_time = 15) {
@@ -207,15 +242,15 @@ check_api_job <- function(pool, trust_id = get_golem_config("trust_name"), sched
     dplyr::filter(
       trust_id == !!trust_id,
       status %in% c("submitted", "completed")
-    ) 
+    )
 
   if (data |>
     dplyr::tally() |>
     dplyr::pull() < 1) {
     return(list("latest_time" = NULL, "estimated_wait_time" = 0))
   } else {
     data <- data |>
-    dplyr::filter(date == max(date, na.rm = TRUE))
+      dplyr::filter(date == max(date, na.rm = TRUE))
   }
 
   latest_time <- data |>
@@ -236,5 +271,7 @@ check_api_job <- function(pool, trust_id = get_golem_config("trust_name"), sched
     TRUE ~ 120
   )
 
+  estimated_wait_time <- if (estimated_wait_time > 0) estimated_wait_time else 10
+
   list("latest_time" = latest_time, "estimated_wait_time" = estimated_wait_time)
 }
diff --git a/R/golem_utils_server.R b/R/golem_utils_server.R
@@ -343,7 +343,7 @@ assign_highlevel_categories <- function(sub_cats) {
     hl_cat <- framework %>%
       dplyr::filter(`Sub-category` == v) %>%
       dplyr::pull(Category)
-    return(if (length(hl_cat) != 0) hl_cat else "Other Category")
+    return(if (length(hl_cat) != 0) hl_cat else "Unknown Category")
   },
   simplify = TRUE, USE.NAMES = FALSE
   )
@@ -420,4 +420,30 @@ transform_sentiment <- function(data, sentiment_column = 'sentiment') {
     dplyr::mutate(
       sentiment = factor(!!rlang::sym(sentiment_column), levels = c("Positive", "Neutral/Mixed", "Negative"))
     )
-}
+}
+
+#' Find rows containing missing values in all specified 
+#' column while keeping rows where any contains values
+#'
+#' @param df A data frame
+#' @param vars list of strings containing the columns
+#' @param negate logical, whether to return the rows where the NA check is true or otherwise
+#' default is `TRUE`
+#'
+#' @return data frame
+#' @export
+drop_na_for_col <- function(df, vars, negate = TRUE) {
+
+  diff <- setdiff(vars, names(df))
+  stopifnot("Some column doesn't exist in data" = length(diff) == 0)
+
+  if (negate) {
+    return(
+      df %>% 
+        dplyr::filter(rowSums(is.na(dplyr::select(., dplyr::all_of(vars)))) != length(vars))
+    )
+  }
+
+  df %>% 
+    dplyr::filter(rowSums(is.na(dplyr::select(., dplyr::all_of(vars)))) == length(vars))
+}
diff --git a/R/mod_data_management.R b/R/mod_data_management.R
@@ -471,7 +471,7 @@ mod_data_management_server <- function(id, db_conn, filter_data, data_exists, us
           showModal(modalDialog(
             title = strong("Success!"),
             HTML(paste(
-              h5(paste(nrow(raw_df), "records successfully imported. The new data is not ready yet, the dashboard is busy predicting the sentiment score.")),
+              h5(paste(nrow(raw_df), "records successfully imported. The dashboard is still processing data to predict category and sentiment")),
               h4(strong(em(paste("Please check back or refresh your browser in about", wait_time, "mins to access the new data"))))
             )),
             easyClose = FALSE

diff --git a/R/mod_documentation_page.R b/R/mod_documentation_page.R
@@ -8,28 +8,41 @@
 mod_documentation_page_ui <- function(id) {
   ns <- NS(id)
   tagList(
-    br(),
-    p("This dashboard uses a machine learning tool (pxtextmining API) to assign one or more sub-categories to free text comments.
-    The categories and subcategories developed in the Qualitative Data Categorisation (QDC) framework are used.
-    The visualisations and interactivity in this dashboard have been chosen to help users to engage with the comments and
-    not just quantify the data."),
-    p("The QDC framework is an evidence-based work that has been carefully designed. it has multiple categories, each with its
-    own set of sub-categories. The category groups similar topics together in a meaningful way to help users navigate the framework
-    more easily. The sub-categories are the actual topics that better reflect the underlying data.
-    A high-level visual of the categories and sub-categories is displayed below:"),
-    img(src = "www/framework_v5.png", width = "100%"),
-    hr(),
-    p("To see detailed description of the sub-categories, please expand the categories below"),
+    h3(strong("Introduction to the Data Categorisation Framework"), style = "color : #005EB8;"),
+    p("This dashboard utilizes the pxtextmining API, a machine learning tool, to assign one or more subcategories to free-text comments
+      based on the Qualitative Data Categorization (QDC) framework. The QDC framework is an evidence-based work that has been designed
+      with several categories, each with its own set of subcategories. The categories group similar topics together to make it easier
+      for users to navigate the framework, while the subcategories reflect the actual topics that better represent the underlying data."),
+    p("The dashboard's visualizations and intuitive interactivity are thoughtfully created to help users effectively engage with the
+      comments and not merely quantify the data. Below is a high-level visual of the categories and subcategories:"),
+    img(src = "www/framework_MVP_version.jpeg", width = "100%"),
+    rep_br(2),
+    p("To see detailed description of the sub-categories, kindly click on the category to expand it."),
     DT::DTOutput(ns("framework_table")),
-    hr(),
     tagList(
-      p("To get further detail about the data categorisation framework and the dashboard
+      h4(strong("Making best use of the qualitative comments"), style = "color : #005EB8;"),
+      HTML(paste0(
+        "This dashboard should be used to facilitate initial exploration of your qualitative data, before
+        drawing fuller insight from the underlying qualitative comments. Before using the dashboard, you
+        should read the good practice guidance on the documentation page: ",
+        a(strong("Good practice guidance."),
+          href = "https://cdu-data-science-team.github.io/PatientExperience-QDC/dashboard/dashboard3.html",
+          target = "_blank"
+        ),
+        "This includes important
+        information, tips, and advice to help you maximise your use of the categorised qualitative comments,
+        whilst avoiding the risks around relying on the quantification of qualitative data."
+      )),
+      rep_br(2),
+      HTML(paste(
+        "To get further detail about the data categorisation framework and the dashboard
         including some illustrative examples for each of the sub-categories.
-        Please see the"),
-      a(p("Patient Experience - QDC documentation Page"),
-        href = "https://cdu-data-science-team.github.io/PatientExperience-QDC/framework/framework3.html",
-        target = "_blank"
-      )
+        Please see the",
+        a(strong("Patient Experience - QDC documentation Page."),
+          href = "https://cdu-data-science-team.github.io/PatientExperience-QDC/framework/framework3.html",
+          target = "_blank"
+        )
+      ))
     ),
   )
 }
@@ -42,7 +55,6 @@ mod_documentation_page_server <- function(id) {
 
     # table
     output$framework_table <- DT::renderDT({
-
       # JaveScript code to collapse the table
       callback_js <- DT::JS(
         "table.on('click', 'tr.dtrg-group', function () {",

diff --git a/R/tidy_data.R b/R/tidy_data.R
@@ -12,10 +12,9 @@ tidy_all_trusts <- function(data) {
       dplyr::filter(
         !is.na(comment_txt),
         !is.null(comment_txt),
-        comment_txt != "NA",
-        comment_txt != "NULL",
-        comment_txt != "",
-        comment_txt != " ",
+        !comment_txt %in% c("Did not answer", "NULL", 
+                            "#NAME?", "NA", "N/A", "", " "),
+        nchar(comment_txt) > 1,
         hidden == 0
       )
   } else {