Merge pull request #184 from OHI-Science/dev

Merging dev changes into master
OHI-Science · Dec 8, 2016 · c0bf7e1 · c0bf7e1
2 parents 410cebb + 73a6c0a
commit c0bf7e1
Show file tree

Hide file tree

Showing 36 changed files with 756 additions and 442 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,11 +1,11 @@
 Package: ohicore
 Description: A collection of functions for generically calculating the Ocean
     Health Index scores as well as individual goals and sub-goals.
-Version: 0.1
-Date: 2013-09-25
+Version: 0.2
+Date: 2016-07-28
 Title: Ocean Health Index calculation package
-Author: Ben Best, Steve Hastings, Darren Hardy
-Maintainer: Ben Best <bbest@nceas.ucsb.edu>
+Author: Ben Best, Melanie Frazier, Julia Stewart Lowndes, Casey O'Hara, Ning Jiang Mendes, Jamie Afflerbach, Steve Hastings, Darren Hardy
+Maintainer: Melanie Frazier <frazier@nceas.ucsb.edu>
 License: MIT
 LazyData: TRUE
 Depends:
@@ -40,6 +40,9 @@ Collate:
     'compare_scores_df.R'
     'gapfill_georegions.R'
     'name_to_rgn.R'
+    'name_2_rgn.R'
+    'collapse_2_rgn.R'
+    'data.R'
     'read_git_csv.R'
     'shp_to_geojson.R'
     'trace_git_csv_value.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -10,9 +10,11 @@ export(Layers)
 export(PlotFlower)
 export(SelectLayersData)
 export(change_plot)
+export(collapse_2_rgn)
 export(compare_scores_df)
 export(gapfill_georegions)
 export(mapvalues)
+export(name_2_rgn)
 export(name_to_rgn)
 export(read_git_csv)
 export(score.clamp)
@@ -24,22 +26,4 @@ exportClasses(Conf)
 exportClasses(Layers)
 import(dplyr)
 import(ggplot2)
-import(git2r)
-import(htmlwidgets)
-import(plotly)
-import(plyr)
-import(reshape2)
 import(tidyr)
-importFrom(dplyr,arrange)
-importFrom(dplyr,desc)
-importFrom(dplyr,failwith)
-importFrom(dplyr,filter)
-importFrom(dplyr,id)
-importFrom(dplyr,intersect)
-importFrom(dplyr,mutate)
-importFrom(dplyr,setdiff)
-importFrom(dplyr,setequal)
-importFrom(dplyr,summarise)
-importFrom(dplyr,summarize)
-importFrom(dplyr,union)
-importFrom(plyr,rename)
diff --git a/R/CalculateAll.R b/R/CalculateAll.R
@@ -51,24 +51,6 @@
 #' ## write
 #' write.csv(scores, 'scores.csv', na='', row.names=F)
 #' }
-#' @import reshape2
-#' @import plyr
-#' @import dplyr
-#' @import tidyr
-#' @import git2r
-#' @importFrom dplyr arrange
-#' @importFrom dplyr desc
-#' @importFrom dplyr failwith
-#' @importFrom dplyr id
-#' @importFrom dplyr mutate
-#' @importFrom dplyr summarize
-#' @importFrom dplyr summarise
-#' @importFrom dplyr filter
-#' @importFrom dplyr intersect
-#' @importFrom dplyr setdiff
-#' @importFrom dplyr setequal
-#' @importFrom dplyr union
-#' @importFrom plyr rename
 #'
 #' @export
 CalculateAll = function(conf, layers){
@@ -102,6 +84,16 @@ CalculateAll = function(conf, layers){
     assign('scores', scores, envir=conf$functions)
     if (nrow(subset(scores, goal==g & dimension %in% c('status','trend')))!=0) stop(sprintf('Scores were assigned to goal %s by previous goal function.', g))
     scores_g = eval(parse(text=goals_X$preindex_function[i]), envir=conf$functions)
+
+    # error if 'status' or 'trend' are missing
+    if ( !all( c('status', 'trend') %in% unique(scores_g$dimension)) ){
+      stop(sprintf('Missing "status" or "trend" dimension in %s goal model\n', g))
+    }
+    # error if something other than 'status' or 'trend' as dimension
+    if ( !all(unique(scores_g$dimension) %in% c('status', 'trend')) ){
+      stop(sprintf('"status" and "trend" should be the only dimensions in %s goal model\n', g))
+    }
+
     if (nrow(scores_g) > 0){
       scores = rbind(scores, scores_g[,c('goal','dimension','region_id','score')])
     }
@@ -112,7 +104,8 @@ CalculateAll = function(conf, layers){
   scores = rbind(scores, scores_P)
 
   ## Calculate Resilience, all goals
-  scores = rbind(scores, CalculateResilienceAll(layers, conf))
+  scores_R = CalculateResilienceAll(layers, conf)
+  scores = rbind(scores, scores_R)
   scores = data.frame(scores)
 
   ## Calculate Goal Score and Likely Future, all goals
@@ -159,7 +152,7 @@ CalculateAll = function(conf, layers){
     scores = rbind(scores, scores_G)
   }
 
-  ## Post-Index functions: supragoals
+  ## Post-Index functions: Calculate Status, Trend, Likely Future State and Scores for 'Supragoals'
   goals_Y = subset(conf$goals, !is.na(postindex_function))
   supragoals = subset(conf$goals, is.na(parent), goal, drop=T); supragoals
 
@@ -172,18 +165,19 @@ CalculateAll = function(conf, layers){
     scores = eval(parse(text=goals_Y$postindex_function[i]), envir=conf$functions)
   }
 
-  ## Calculate Region Index Scores using goal weights
-  cat(sprintf('Calculating Index score for each region for supragoals using goal weights...\n'))
+  ## Calculate Overall Index Scores for each region using goal weights
+  cat(sprintf('Calculating Index Score for each region using goal weights to combine goal scores...\n'))
 
   # calculate weighted-mean Index scores from goal scores and rbind to 'scores' variable
   scores =
     rbind(scores,
-          scores %>%
+        scores %>%
 
             # filter only supragoal scores, merge with supragoal weightings
             dplyr::filter(dimension=='score',  goal %in% supragoals) %>%
             merge(conf$goals %>%
-                    select(goal, weight)) %>%
+                    dplyr::select(goal, weight)) %>%
+            dplyr::mutate(weight = as.numeric(weight)) %>%
 
             # calculate the weighted mean of supragoals, add goal and dimension column
             dplyr::group_by(region_id) %>%
@@ -193,8 +187,8 @@ CalculateAll = function(conf, layers){
             data.frame())
 
 
-  ## Calculate Region Likely Future State Scores using goal weights
-  cat(sprintf('Calculating Likely Future State for each region for supragoals using goal weights...\n'))
+  ## Calculate Overall Index Likely Future State for each region
+  cat(sprintf('Calculating Index Likely Future State for each region...\n'))
 
   # calculate weighted-mean Likely Future State scores and rbind to 'scores' variable
   scores =
@@ -204,7 +198,7 @@ CalculateAll = function(conf, layers){
             # filter only supragoal scores, merge with supragoal weightings
             dplyr::filter(dimension=='future',  goal %in% supragoals) %>%
             merge(conf$goals %>%
-                    select(goal, weight)) %>%
+                    dplyr::select(goal, weight)) %>%
 
             # calculate the weighted mean of supragoals, add goal and dimension column
             dplyr::group_by(region_id) %>%

diff --git a/R/CalculatePressuresAll.R b/R/CalculatePressuresAll.R
@@ -129,7 +129,7 @@ CalculatePressuresAll = function(layers, conf){
 
   ## further preparation of matrix data for analysis
   p_matrix <- p_matrix %>%
-    left_join(p_categories, by="layer") %>%
+    dplyr::left_join(p_categories, by="layer") %>%
     dplyr::group_by(goal, element, category, subcategory) %>%
     dplyr::mutate(max_subcategory = max(m_intensity)) %>%
     data.frame()
@@ -221,7 +221,7 @@ CalculatePressuresAll = function(layers, conf){
   scores <- regions_dataframe %>%
     dplyr::left_join(calc_pressure, by="region_id") %>%
     dplyr::mutate(dimension="pressures") %>%
-    select(goal, dimension, region_id, score=pressure) %>%
-    mutate(score = round(score*100, 2))
+    dplyr::select(goal, dimension, region_id, score=pressure) %>%
+    dplyr::mutate(score = round(score*100, 2))
   return(scores)
 }
diff --git a/R/CalculateResilienceAll.R b/R/CalculateResilienceAll.R
@@ -194,8 +194,8 @@ CalculateResilienceAll = function(layers, conf){
   scores <- regions_dataframe %>%
     dplyr::left_join(calc_resil, by="region_id") %>%
     dplyr::mutate(dimension="resilience") %>%
-    select(goal, dimension, region_id, score=val_num) %>%
-    mutate(score = round(score*100, 2))
+    dplyr::select(goal, dimension, region_id, score=val_num) %>%
+    dplyr::mutate(score = round(score*100, 2))
   return(scores)
 
 }
diff --git a/R/change_plot.R b/R/change_plot.R
@@ -11,8 +11,6 @@
 #' @param save_csv If TRUE, the difference csv file will be saved.
 #' @param save_png If TRUE, a static png of the image will be saved. 
 #' 
-#' @import plotly
-#' @import htmlwidgets
 #' 
 #' @export
 change_plot = function(repo = "ohi-global", scenario="eez2014", commit="previous", 

diff --git a/R/collapse_2_rgn.R b/R/collapse_2_rgn.R
@@ -0,0 +1,113 @@
+#' Collapse region
+#' 
+#' Collapse data for duplicate regions.
+#' 
+#' @param df_in dataset
+#' @param fld_id 
+#' @param fld_value field with value, defaults to 'value'
+#' @param collapse_fxn function to collapse duplicate regions into one (example: China, Macau, Hong Kong)
+#' @param collapse_wts table with weights if weighted mean function is chosen
+#' 
+#' @details This function collapses duplicate regions (example: China, Macau, Hong Kong)
+#'  
+#' @keywords ohi
+#' @export
+collapse_2_rgn <- function(df_in, 
+                         fld_value,
+                         fld_id = 'rgn_id',
+                         collapse_fxn = c('sum','mean','weighted_mean')[1], 
+                         collapse_wts = NULL) {
+  ### Expectation: a data frame, with columns for rgn_id, maybe rgn_name, and some value.
+  ### Group by rgn_id and unique fields; summarize value column(s) according to collapse_fxn.
+  ### collapse_csv? collapse_flds_join (not used...?)?
+  ### collapse_wts will be a .csv or a df with rgn_id and another column to be used for weighting.
+  ### What if other variables not captured by "fld_unique"?
+  ### DETAIL. Return a data.frame (vs add_rgn_id which writes to a csv) 
+  ### and perform extra checks, including collapsing on duplicates.
+  ### Note: The original fld_name lost because possible to collapse multiple 
+  ### countries into a single region.
+
+  ### check for valid arguments  stopifnot(fld_name %in% names(d))
+  ### ? should be warning for these?
+  stopifnot(fld_id       %in% names(df_in))
+  stopifnot(fld_value    %in% names(df_in))
+  stopifnot(all(fld_keep %in% names(df_in)))
+  #  stopifnot(sum(duplicated(df_in[ , fld_keep])) == 0)
+
+  fld_keep_rgn_id <- c(fld_id, setdiff(c(fld_keep), fld_id))
+
+  ### are there duplicates? create index of duplicated records
+  i_dupes <- duplicated(df_in[, fld_keep_rgn_id], fromLast = FALSE) | 
+    duplicated(df_in[, fld_keep_rgn_id], fromLast = TRUE)
+
+  if (sum(i_dupes) == 0) {
+    ### No duplicates - return dataframe as is.
+    cat(sprintf('No duplicates found in %s. \n', paste(fld_keep_rgn_id, collapse = ', ')))
+    df_out <- df_in
+  } else {
+    ### Duplicates found; collapse using function
+    cat(sprintf('\nDuplicate values found for %s. \n', paste(fld_keep_rgn_id, collapse = ', ')))
+    cat(sprintf('Resolving by collapsing %s with collapse_fxn: %s after first removing all NAs from duplicates...\n', 
+                fld_id, collapse_fxn))
+
+    fld_dropped <- names(df_in)[!names(df_in) %in% c(fld_value, fld_keep_rgn_id)]
+    message(sprintf('Dropping variables: %s\n', paste(fld_dropped, collapse = ', ')))
+    message('  Use argument fld_keep to prevent variables from being dropped.\n')
+
+    ### create a data.frame of just the duplicated records, for collapsing
+    df_in_dup <- df_in[i_dupes, ] %>%
+      dplyr::arrange(rgn_id, rgn_name)
+    print(df_in_dup)
+
+    ### set tmp_value to be the value, to protect original value
+    df_in_dup$tmp_value <- df_in_dup[[fld_value]]
+
+    if (collapse_fxn == "sum") {
+      df_in_collapsed <- df_in_dup %>% 
+        dplyr::filter(!is.na(tmp_value)) %>%
+        dplyr::group_by_(.dots = as.list(fld_keep_rgn_id)) %>% 
+        dplyr::summarize(tmp_value = sum(tmp_value)) 
+      ### NOTE: since NAs removed above, no need here... similar below
+    } else if(collapse_fxn == "mean") {
+      df_in_collapsed <- df_in_dup %>% 
+        dplyr::filter(!is.na(tmp_value)) %>% 
+        dplyr::group_by_(.dots=as.list(fld_keep_rgn_id)) %>% 
+        dplyr::summarize(tmp_value = mean(tmp_value))
+    } else if(collapse_fxn == "weighted_mean") {
+      wts_df <- switch(class(collapse_wts),
+                       'character'  = read.csv(collapse_wts, stringsAsFactors = FALSE),
+                       'data.frame' = collapse_wts,
+                       'NULL'       = stop('Must set weighting values for weighted mean'),
+                       as.data.frame(collapse_wts))
+
+      flds_matched <- intersect(names(wts_df), names(df_in_dup))
+      fld_collapse <- setdiff(names(wts_df), names(df_in_dup))
+      stopifnot(length(fld_collapse) == 1)
+      wts_df['tmp_weight'] <- wts_df[fld_collapse]
+      wts_df <- wts_df[, c(flds_matched, "tmp_weight")]
+      df_in_collapsed <- df_in_dup %>% 
+        dplyr::left_join(wts_df, by = flds_matched) %>% 
+        dplyr::filter(!is.na(tmp_value) & !is.na(tmp_weight)) %>% 
+        dplyr::group_by_(.dots = as.list(fld_keep_rgn_id)) %>% 
+        dplyr::summarize(tmp_value = sum(tmp_value * tmp_weight)/sum(tmp_weight))
+    } else {
+      stop("collapse_fxn needs to be a string of one of the following: sum, mean, weighted_mean.")
+    }
+
+    ### clean up tmp_value field, quick check
+    df_in_collapsed <- df_in_collapsed %>%
+      dplyr::rename_(.dots = setNames('tmp_value', fld_value))
+    head(df_in_collapsed)
+
+    df_out <- rbind(df_in[!i_dupes, c(fld_keep_rgn_id, fld_value)],
+                    df_in_collapsed)
+  }
+
+  ### limit to same subset of fields for consistent behavior regardless of duplicates presents
+  df_out <- df_out[, c(fld_keep_rgn_id, fld_value)]
+
+  ### check to ensure no duplicates remaining in kept fields
+  stopifnot(duplicated(df_out[, c(fld_keep_rgn_id)]) == 0) 
+
+  return(as.data.frame(df_out))
+}
diff --git a/R/compare_scores_df.R b/R/compare_scores_df.R
@@ -27,24 +27,24 @@ compare_scores_df = function(a_csv, b_csv, r_csv, g_csv){
 
   # merge
   d = a %>%
-    base::merge(
+    merge(
       b, 
       by=c('goal','dimension','region_id'), 
       suffixes=c('.a','.b')) %>%        
     dplyr::rename(rgn_id=region_id) %>%
-    mutate(
+    dplyr::mutate(
       score.dif = score.a - score.b,
       score.na = is.na(score.a)!=is.na(score.b)) %>%      
-    left_join(
-      rbind_list(
+    dplyr::left_join(
+      dplyr::bind_rows(
         r %>%
-          select(rgn_id, rgn_name=label),
+          dplyr::select(rgn_id, rgn_name=label),
         data.frame(rgn_id=0, rgn_name='GLOBAL')),
       by='rgn_id') %>%
     # filter(abs(score_dif) > 0.01 | score_na == T) %>%
-    arrange(rgn_id!=0, goal!='Index', dimension!='score', goal, desc(dimension), desc(abs(score.dif)), is.na(score.a), is.na(score.b)) %>%
-    select(goal, dimension, rgn_id, rgn_name, score.a, score.b, score.dif) %>%
-    mutate(
+    dplyr::arrange(rgn_id!=0, goal!='Index', dimension!='score', goal, desc(dimension), desc(abs(score.dif)), is.na(score.a), is.na(score.b)) %>%
+    dplyr::select(goal, dimension, rgn_id, rgn_name, score.a, score.b, score.dif) %>%
+    dplyr::mutate(
       goal      = factor(goal, c('Index', g$goal)),
       dimension = factor(dimension, ohi_dimensions),
       id        = row_number())