Skip to content

Commit

Permalink
update chart to accept named arguments in cmd line invocation of R sc…
Browse files Browse the repository at this point in the history
…ript
  • Loading branch information
jeanetteclark committed Sep 17, 2024
1 parent 066ab39 commit 2c68825
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 47 deletions.
2 changes: 1 addition & 1 deletion helm/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@ version: 0.1.0
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.0.0"
appVersion: "1.1.0"
4 changes: 2 additions & 2 deletions helm/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ RUN apt-get update && apt-get install -y \
libjpeg-dev \
&& rm -rf /var/lib/apt/lists/*

RUN Rscript -e "install.packages(c('devtools', 'dataone', 'tidyr', 'redland', 'jsonlite', 'lubridate'))"
RUN Rscript -e "devtools::install_github('dataoneorg/scythe@89d52978', dependencies = TRUE)"
RUN Rscript -e "install.packages(c('remotes', 'dataone', 'tidyr', 'redland', 'jsonlite', 'lubridate', 'optparse'))"
RUN Rscript -e "remotes::install_github('dataoneorg/scythe@89d52978')"

USER scythe:scythe

Expand Down
3 changes: 2 additions & 1 deletion helm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ already in the metrics service are removed, and the citations are written to a c
In `values.yaml`, key fields to configure are:

- **`cronjob.schedule`**: Schedule for the CronJob (in cron format).
- **`cronjob.nodes`**: A set of node identifiers to be passed to the R script.
- **`cronjob.nodes`**: A set of node identifiers to be passed to the R script, as a comma-separated list.
- **`cronjob.rows`**: Optional number of rows to return per node when getting DOIs. Leave empty to use the script's default of 100000 rows.

## API Keys

Expand Down
87 changes: 49 additions & 38 deletions helm/scripts/search.R
Original file line number Diff line number Diff line change
@@ -1,74 +1,85 @@
# search.R
#!/usr/bin/env Rscript
suppressPackageStartupMessages(library(dplyr))
library(tidyr)
library(scythe)
library(dataone)
library(jsonlite)
suppressPackageStartupMessages(library(lubridate))
library(optparse)

# Literature sources handed to citation_search() by the top-level script.
sources <- c("plos", "xdd", "scopus", "springer")
# Positional command-line arguments; the top-level loop iterates over these
# as node identifiers.
nodes <- commandArgs(trailingOnly = TRUE)
#' Entry point for the citation search script.
#'
#' Parses the command-line options, gathers DOIs from every requested node,
#' runs citation_search() over the unique DOIs, and writes the outcome to
#' `scythe-citations-<YYYYMMDD>.csv` in the working directory. Citations that
#' get_metrics_citations() already returns are removed before writing. When
#' nothing (new) is found the file holds a single explanatory line instead of
#' CSV data.
#'
#' Command-line options:
#'   -r, --rows   Number of rows to return from each node query.
#'   -n, --nodes  Comma separated list of nodes to query (required).
#'
#' @return Called for its side effect of writing the output file.
main <- function() {
  option_list <- list(
    make_option(c("-r", "--rows"), type = "integer", default = 100000,
                help = "Number of rows to return from query [default %default]"),
    make_option(c("-n", "--nodes"), type = "character",
                help = "Comma separated list of nodes to query")
  )

  # parse command-line arguments
  parser <- OptionParser(option_list = option_list)
  opts <- parse_args(parser)

  # --nodes has no default, so it is NULL when omitted; fail with a clear
  # message instead of letting strsplit() raise "non-character argument".
  if (is.null(opts$nodes) || !nzchar(opts$nodes)) {
    print_help(parser)
    stop("At least one node must be supplied with -n/--nodes.", call. = FALSE)
  }

  num_rows <- opts$rows
  # Tolerate stray whitespace and empty entries such as "a, b," in the list.
  nodes <- trimws(strsplit(opts$nodes, ",", fixed = TRUE)[[1]])
  nodes <- nodes[nzchar(nodes)]

  sources <- c("plos", "xdd", "scopus", "springer")

  # Collect DOIs per node without growing a vector inside a loop.
  dois_by_node <- lapply(nodes, function(node) {
    message(paste("Gathering DOIs for: ", node))
    get_node_dois(node, num_rows)
  })
  dois_unique <- unique(unlist(dois_by_node, use.names = FALSE))

  # set up file to write to
  today <- format(Sys.Date(), "%Y%m%d")
  fp <- paste0("scythe-citations-", today, ".csv")

  message("Beginning citations search.")
  found_citations <- citation_search(dois_unique, sources)

  if (is.null(found_citations) || nrow(found_citations) == 0) {
    writeLines("No citations found.", fp)
  } else {
    existing_citations <- get_metrics_citations()
    new_citations <- anti_join(
      found_citations,
      existing_citations,
      by = c("dataset_id" = "target_id", "article_id" = "source_id")
    )
    if (nrow(new_citations) > 0) {
      # The write.csv argument is `row.names`; `row_names` is not accepted.
      write.csv(new_citations, fp, row.names = FALSE)
    } else {
      writeLines("No new citations found.", fp)
    }
  }
}

#' Retrieve DOI identifiers from a DataONE member node.
#'
#' Queries the node's Solr index for records whose `id` or `seriesId` matches
#' `doi*` and returns every returned identifier that contains "doi:".
#'
#' @param node_id Node identifier passed to getMNode(), e.g. "urn:node:ARCTIC".
#' @param num_rows Value sent as the Solr `rows` parameter. Defaults to "10",
#'   the value that used to be hard-coded, so one-argument calls behave as
#'   before.
#' @return Character vector of identifiers containing "doi:". The `id` and
#'   `seriesId` columns are concatenated and not de-duplicated here.
get_node_dois <- function(node_id, num_rows = "10") {
  mn <- getMNode(CNode("PROD"), node_id)
  queryParamList <- list(q = "id:doi* OR seriesId:doi*",
                         fl = "id, seriesId",
                         start = "0",
                         rows = num_rows)
  result <- query(mn, solrQuery = queryParamList, as = "data.frame", parse = FALSE)
  pids <- c(result$id, result$seriesId)
  grep("doi:", pids, value = TRUE)
}

#' Fetch citations already recorded by the metrics service.
#'
#' Requests the `citations` metric for the month range `from`..`to` and
#' flattens the response so that each row holds one `target_id` / `source_id`
#' pair.
#'
#' NOTE(review): the request URL targets a "stage"/"test" host and filters on
#' urn:node:ARCTIC only, while callers may search other nodes as well; confirm
#' this is intended.
#'
#' @param from Start of the range; anything as.Date() accepts.
#' @param to End of the range; anything as.Date() accepts.
#' @return A data frame with (at least) `target_id` and `source_id` columns.
#'   When the service returns no citations, a zero-row data frame with those
#'   two character columns is returned so callers can still join on them.
get_metrics_citations <- function(from = as.POSIXct("2000-01-01"), to = as.POSIXct(Sys.Date())) {
  # The service expects MM/DD/YYYY; format() zero-pads month and day.
  from_q <- format(as.Date(from), "%m/%d/%Y")
  to_q <- format(as.Date(to), "%m/%d/%Y")

  d <- fromJSON(paste0('https://logproc-stage-ucsb-1.test.dataone.org/metrics?q={%22metricsPage%22:{%22total%22:0,%22start%22:0,%22count%22:0},%22metrics%22:[%22citations%22],%22filterBy%22:[{%22filterType%22:%22repository%22,%22values%22:[%22urn:node:ARCTIC%22],%22interpretAs%22:%22list%22},{%22filterType%22:%22month%22,%22values%22:[%22', from_q,'%22,%22', to_q, '%22],%22interpretAs%22:%22range%22}],%22groupBy%22:[%22month%22]}'))

  output_json <- d$resultDetails$citations # pulls citation info

  # Without this guard an empty payload yields a column-less data frame and
  # unnest_longer(target_id) fails on the missing column.
  if (length(output_json) == 0) {
    return(data.frame(target_id = character(0),
                      source_id = character(0),
                      stringsAsFactors = FALSE))
  }

  # binds nested citation info into a data frame
  output_df <- as.data.frame(do.call(rbind, output_json), row.names = FALSE)

  output_df %>%
    unnest_longer(target_id) %>%
    unnest_longer(source_id)
}

# NOTE(review): the statements below repeat the body of main() but read the
# top-level `nodes` and `sources` variables and call get_node_dois() with a
# single argument. Presumably this is the pre-refactor script body; confirm
# whether it should remain alongside the main() call at the end.
dois <- c()
for (node in nodes) {
  message(paste("Gathering DOIs for: ", node))
  node_dois <- get_node_dois(node)
  dois <- c(dois, node_dois)
}
dois_unique <- unique(dois)

# set up file to write to
today <- format(Sys.Date(), "%Y%m%d")
fp <- paste0("scythe-citations-", today, ".csv")

message("Beginning citations search.")
found_citations <- citation_search(dois_unique, sources)

if (is.null(found_citations) || nrow(found_citations) == 0) {
  writeLines("No citations found.", fp)
} else {
  existing_citations <- get_metrics_citations()
  new_citations <- anti_join(
    found_citations,
    existing_citations,
    by = c("dataset_id" = "target_id", "article_id" = "source_id")
  )
  if (nrow(new_citations) > 0) {
    # The write.csv argument is `row.names`; `row_names` is not accepted.
    write.csv(new_citations, fp, row.names = FALSE)
  } else {
    writeLines("No new citations found.", fp)
  }
}
main()
6 changes: 5 additions & 1 deletion helm/templates/cron-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,11 @@ spec:
command:
- /bin/sh
- -c
- {{ .Values.cronjob.command }} {{- range .Values.cronjob.nodes }} {{ . }} {{ end }}
- {{- if .Values.cronjob.rows }}
{{ .Values.cronjob.command }} -r {{ .Values.cronjob.rows }} -n {{ .Values.cronjob.nodes }}
{{- else }}
{{ .Values.cronjob.command }} -n {{ .Values.cronjob.nodes }}
{{- end }}
volumeMounts:
- name: {{ .Values.persistence.claimName }}
mountPath: {{ .Values.persistence.mountPath }}
Expand Down
8 changes: 4 additions & 4 deletions helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ persistence:

cronjob:
name: scythe
schedule: "* * 1 * *"
schedule: "0 12 1 * *"
command: Rscript --vanilla /apps/scythe/search.R
nodes:
- urn:node:ARCTIC
- urn:node:ESS_DIVE
nodes: 'urn:node:ARCTIC,urn:node:ESS_DIVE'
# leave blank to use the script's default (100000 rows per node)
rows:

serviceAccount:
# Specifies whether a service account should be created
Expand Down

0 comments on commit 2c68825

Please sign in to comment.