From 2c68825d93249c324f7273a8edfca1af308d080f Mon Sep 17 00:00:00 2001 From: Jeanette Clark Date: Tue, 17 Sep 2024 16:16:42 -0700 Subject: [PATCH] update chart to accept named arguments in cmd line invocation of R script --- helm/Chart.yaml | 2 +- helm/Dockerfile | 4 +- helm/README.md | 3 +- helm/scripts/search.R | 87 ++++++++++++++++++++---------------- helm/templates/cron-job.yaml | 6 ++- helm/values.yaml | 8 ++-- 6 files changed, 63 insertions(+), 47 deletions(-) diff --git a/helm/Chart.yaml b/helm/Chart.yaml index 9318f95..56e9554 100644 --- a/helm/Chart.yaml +++ b/helm/Chart.yaml @@ -21,4 +21,4 @@ version: 0.1.0 # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "1.0.0" +appVersion: "1.1.0" diff --git a/helm/Dockerfile b/helm/Dockerfile index a792f3c..7419b97 100644 --- a/helm/Dockerfile +++ b/helm/Dockerfile @@ -21,8 +21,8 @@ RUN apt-get update && apt-get install -y \ libjpeg-dev \ && rm -rf /var/lib/apt/lists/* -RUN Rscript -e "install.packages(c('devtools', 'dataone', 'tidyr', 'redland', 'jsonlite', 'lubridate'))" -RUN Rscript -e "devtools::install_github('dataoneorg/scythe@89d52978', dependencies = TRUE)" +RUN Rscript -e "install.packages(c('remotes', 'dataone', 'tidyr', 'redland', 'jsonlite', 'lubridate', 'optparse'))" +RUN Rscript -e "remotes::install_github('dataoneorg/scythe@89d52978')" USER scythe:scythe diff --git a/helm/README.md b/helm/README.md index eba7392..88a5433 100644 --- a/helm/README.md +++ b/helm/README.md @@ -14,7 +14,8 @@ already in the metrics service are removed, and the citations are written to a c In `values.yaml`, key fields to configure are: - **`cronjob.schedule`**: Schedule for the CronJob (in cron format). -- **`cronjob.nodes`**: A set of node identifiers to be passed to the R script. +- **`cronjob.nodes`**: A set of node identifiers to be passed to the R script, as a comma separated list +- **`cronjob.rows`**: Optional number of rows to return per node when getting DOIs. Leave empty to return all identifiers ## API Keys diff --git a/helm/scripts/search.R b/helm/scripts/search.R index 2c08ae6..0daace3 100644 --- a/helm/scripts/search.R +++ b/helm/scripts/search.R @@ -1,74 +1,85 @@ -# search.R +#!/usr/bin/env Rscript suppressPackageStartupMessages(library(dplyr)) library(tidyr) library(scythe) library(dataone) library(jsonlite) suppressPackageStartupMessages(library(lubridate)) +library(optparse) -sources <- c("plos", "xdd", "scopus", "springer") -nodes <- commandArgs(trailingOnly = TRUE) +main <- function(){ + + option_list <- list( + make_option(c("-r", "--rows"), type="integer", default=100000, + help="Number of rows to return from query [default %default]"), + make_option(c("-n", "--nodes"), type="character", help="Comma separated list of nodes to query") + ) + + # parse command-line arguments + parser <- OptionParser(option_list=option_list) + opts <- parse_args(parser) + + num_rows <- opts$rows + nodes <- strsplit(opts$nodes, ",", fixed = TRUE)[[1]] + + sources <- c("plos", "xdd", "scopus", "springer") + + dois <- c() + for (node in nodes){ + message(paste("Gathering DOIs for: ", node)) + node_dois <- get_node_dois(node, num_rows) + dois <- c(dois, node_dois) + } + dois_unique <- unique(dois) + + # set up file to write to + today <- format(Sys.Date(), "%Y%m%d") + fp <- paste0("scythe-citations-", today, ".csv") + + message("Beginning citations search.") + found_citations <- citation_search(dois_unique, sources) + + if (is.null(found_citations) || nrow(found_citations) == 0){ + writeLines("No citations found.", fp) + } else { + existing_citations <- get_metrics_citations() + new_citations <- anti_join(found_citations, existing_citations, by = c("dataset_id" = "target_id", "article_id" = "source_id")) + if (nrow(new_citations) > 0) { + write.csv(new_citations, fp, row_names = FALSE) + } else { + writeLines("No new citations found.", fp) + } + } +} -get_node_dois <- function(node_id) { +get_node_dois <- function(node_id, num_rows) { mn <- getMNode(CNode("PROD"), node_id) queryParamList <- list(q="id:doi* OR seriesId:doi*", fl="id, seriesId", start ="0", - rows = "10") + rows = num_rows) result <- query(mn, solrQuery=queryParamList, as="data.frame", parse=FALSE) pids <- c(result$id, result$seriesId) dois <- grep("doi:", pids, value = TRUE) return(dois) } - get_metrics_citations <- function(from = as.POSIXct("2000-01-01"), to = as.POSIXct(Sys.Date())){ - from <- as.Date(from); to <- as.Date(to) from_q <- paste(stringr::str_pad(month(from), 2, side = "left", pad = "0"), stringr::str_pad(day(from), 2, side = "left", pad = "0"), stringr::str_pad(year(from), 2, side = "left", pad = "0"), sep = "/") - to_q <- paste(stringr::str_pad(month(to), 2, side = "left", pad = "0"), stringr::str_pad(day(to), 2, side = "left", pad = "0"), stringr::str_pad(year(to), 2, side = "left", pad = "0"), sep = "/") - d <- fromJSON(paste0('https://logproc-stage-ucsb-1.test.dataone.org/metrics?q={%22metricsPage%22:{%22total%22:0,%22start%22:0,%22count%22:0},%22metrics%22:[%22citations%22],%22filterBy%22:[{%22filterType%22:%22repository%22,%22values%22:[%22urn:node:ARCTIC%22],%22interpretAs%22:%22list%22},{%22filterType%22:%22month%22,%22values%22:[%22', from_q,'%22,%22', to_q, '%22],%22interpretAs%22:%22range%22}],%22groupBy%22:[%22month%22]}')) - output_json <- d$resultDetails$citations # pulls citation info output_df <- as.data.frame(do.call(rbind, output_json), row.names = FALSE) # binds nested cit info into dataframe - output_df <- output_df %>% unnest_longer(target_id) %>% unnest_longer(source_id) - return(output_df) } -dois <- c() -for (node in nodes){ - message(paste("Gathering DOIs for: ", node)) - node_dois <- get_node_dois(node) - dois <- c(dois, node_dois) -} -dois_unique <- unique(dois) - -# set up file to write to -today <- format(Sys.Date(), "%Y%m%d") -fp <- paste0("scythe-citations-", today, ".csv") - -message("Beginning citations search.") -found_citations <- citation_search(dois_unique, sources) - -if (is.null(found_citations) || nrow(found_citations) == 0){ - writeLines("No citations found.", fp) -} else { - existing_citations <- get_metrics_citations() - new_citations <- anti_join(found_citations, existing_citations, by = c("dataset_id" = "target_id", "article_id" = "source_id")) - if (nrow(new_citations) > 0) { - write.csv(new_citations, fp, row_names = FALSE) - } else { - writeLines("No new citations found.", fp) - } -} \ No newline at end of file +main() diff --git a/helm/templates/cron-job.yaml b/helm/templates/cron-job.yaml index 54174a0..79de389 100644 --- a/helm/templates/cron-job.yaml +++ b/helm/templates/cron-job.yaml @@ -28,7 +28,11 @@ spec: command: - /bin/sh - -c - - {{ .Values.cronjob.command }} {{- range .Values.cronjob.nodes }} {{ . }} {{ end }} + - {{- if .Values.cronjob.rows }} + {{ .Values.cronjob.command }} -r {{ .Values.cronjob.rows }} -n {{ .Values.cronjob.nodes }} + {{- else }} + {{ .Values.cronjob.command }} -n {{ .Values.cronjob.nodes }} + {{- end }} volumeMounts: - name: {{ .Values.persistence.claimName }} mountPath: {{ .Values.persistence.mountPath }} diff --git a/helm/values.yaml b/helm/values.yaml index 81308f5..033b68a 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -21,11 +21,11 @@ persistence: cronjob: name: scythe - schedule: "* * 1 * *" + schedule: "0 12 1 * *" command: Rscript --vanilla /apps/scythe/search.R - nodes: - - urn:node:ARCTIC - - urn:node:ESS_DIVE + nodes: 'urn:node:ARCTIC,urn:node:ESS_DIVE' + # leave blank if returning all rows + rows: serviceAccount: # Specifies whether a service account should be created