Skip to content

Commit

Permalink
Merge pull request #42 from DataONEorg/develop
Browse files Browse the repository at this point in the history
Add helm chart to do citation searches on kubernetes
  • Loading branch information
jeanetteclark authored Sep 18, 2024
2 parents 4208ad2 + b8ba920 commit dea912f
Show file tree
Hide file tree
Showing 16 changed files with 455 additions and 12 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: scythe
Title: Harvest and register data package citations
Version: 1.0.0
Version: 1.1.0
Authors@R: c(
person("Jeanette", "Clark", role = c("aut", "cre"), email = "[email protected]", comment=c(ORCID = "0000-0003-4703-1974")),
person("Matthew B.", "Jones", role = "aut", email = "[email protected]", comment=c(ORCID = "0000-0003-0077-4738")),
Expand Down
8 changes: 7 additions & 1 deletion R/citation_search.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,13 @@ citation_search <- function(identifiers,
})

# Combine the resulting data frames and return the result df
result <- dplyr::bind_rows(result_df_list)
if (all(sapply(result_df_list, function(x) nrow(x) == 0))) {
# If all data frames are empty, create an empty data frame with the same structure
result <- data.frame()
} else {
# Otherwise, bind the rows
result <- dplyr::bind_rows(result_df_list)
}

return(result)
}
Expand Down
18 changes: 10 additions & 8 deletions R/citation_search_scopus.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,20 @@ citation_search_scopus <- function(identifiers) {
report_est_wait(length(identifiers), wait_seconds)

key <- scythe_get_key("scopus")

# initialize df for storing results in orderly fashion
scopus_results <- data.frame(
article_id = character(),
article_title = character(),
dataset_id = character(),
source = character()
)

if (is.na(key)) {
warning(
"Skipping Scopus search due to missing API key. Set an API key using scythe_set_key() to include Scopus results."
)
return()
return(scopus_results)
}
identifiers_enc <- utils::URLencode(identifiers, reserved = TRUE)

Expand All @@ -40,13 +49,6 @@ citation_search_scopus <- function(identifiers) {
))
}

# initialize df for storing results in orderly fashion
scopus_results <- data.frame(
article_id = character(),
article_title = character(),
dataset_id = character(),
source = character()
)

# extract relevant information from raw results
for (i in 1:length(results)) {
Expand Down
10 changes: 9 additions & 1 deletion R/citation_search_springer.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,21 @@ citation_search_springer <- function(identifiers) {
report_est_wait(length(identifiers), wait_seconds)

identifiers <- check_identifiers(identifiers)

# initialize df for storing results in orderly fashion
springer_results <- data.frame(
article_id = character(),
article_title = character(),
dataset_id = character(),
source = character()
)

key <- scythe_get_key("springer")
if (is.na(key)) {
warning(
"Skipping Springer search due to missing API key. Set an API key using scythe_set_key() to include Springer results."
)
return()
return(springer_results)
}

identifiers_enc <- utils::URLencode(identifiers, reserved = TRUE)
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ including Scopus, PLOS, Springer, and XDD.
### Released version

```
remotes::install_github("DataONEorg/scythe@v1.0.0")
remotes::install_github("DataONEorg/scythe@v1.1.0")
```

The *scythe* R package should be available for use at this point.
Expand Down Expand Up @@ -67,6 +67,7 @@ keyring::key_get("springer", keyring = "scythe")
Work on this package was supported by:

- NSF-PLR grant #1546024 to M. B. Jones, S. Baker-Yeboah, J. Dozier, M. Schildhauer, and A. Budden
- NSF-PLR grant #2042102 to M. B. Jones, A. Budden, M. Schildhauer, and J. Dozier

[![nceas_footer](https://live-ncea-ucsb-edu-v01.pantheonsite.io/sites/default/files/2020-03/NCEAS-full%20logo-4C.png)](https://www.nceas.ucsb.edu)

Expand Down
23 changes: 23 additions & 0 deletions helm/.helmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
24 changes: 24 additions & 0 deletions helm/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
apiVersion: v2
name: scythe
description: A Helm chart for Kubernetes

# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: "v1.1.0"

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "v1.1.0"
31 changes: 31 additions & 0 deletions helm/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Image for the scythe citation search script (helm/scripts/search.R).
FROM rocker/r-ver:4

# Dedicated unprivileged system user/group and the directories it owns:
# /var/data/scythe is the working directory, /apps/scythe holds the script.
# `owner:group` is the supported chown separator; `owner.group` is deprecated.
RUN groupadd -r scythe && useradd -r -g scythe scythe
RUN mkdir -p /var/data/scythe && chown scythe:scythe /var/data/scythe
RUN mkdir -p /apps/scythe && chown scythe:scythe /apps/scythe


# System libraries needed to build the R packages installed below.
RUN apt-get update && apt-get install -y \
librdf-dev \
libxml2-dev \
libfontconfig1-dev \
libssl-dev \
libcurl4-gnutls-dev \
libsodium-dev \
libfribidi-dev \
libgit2-dev \
libharfbuzz-dev \
libfreetype6-dev \
libpng-dev \
libtiff5-dev \
libjpeg-dev \
&& rm -rf /var/lib/apt/lists/*

# R dependencies of the search script, then scythe itself pinned to the
# release tag that matches the chart's appVersion.
RUN Rscript -e "install.packages(c('remotes', 'dataone', 'tidyr', 'redland', 'jsonlite', 'lubridate', 'optparse'))"
RUN Rscript -e "remotes::install_github('dataoneorg/scythe@v1.1.0')"

# Drop privileges for everything that runs in the container.
USER scythe:scythe

WORKDIR /var/data/scythe

COPY ./scripts/search.R /apps/scythe/
52 changes: 52 additions & 0 deletions helm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Scythe Helm Chart

This Helm chart deploys a CronJob that does a citation search on a set of DataONE member nodes using `scythe`.

## Search Script and Container

`scripts/search.R` is copied into the Docker image (see the `Dockerfile`) and run by the CronJob. It takes the node identifiers
listed in the `values.yaml` file as input. DOIs (either identifiers or series identifiers) are retrieved from each node,
then passed through `scythe::citation_search`, which searches for citations in PLOS, Springer, Scopus, and xDD. Citations
already in the metrics service are removed, and the remaining citations are written to a CSV file. This table can be passed to `scythe::write_citation_pairs` to create the JSON file needed for ingest into the metrics system.

## CronJob

In `values.yaml`, key fields to configure are:

- **`cronjob.schedule`**: Schedule for the CronJob (in cron format).
- **`cronjob.nodes`**: A comma-separated list of node identifiers to pass to the R script.
- **`cronjob.rows`**: Optional number of rows to return per node when getting DOIs. Leave empty to return all identifiers

## API Keys

For instructions on obtaining an API key, see README.md at the package level.

Keys are made accessible to the deployment using Kubernetes secrets. To set API keys, run:

```
kubectl create secret generic -n scythe api-keys \
--from-literal=springer={key} \
--from-literal=scopus={key}
```

## Persistent Storage

This Helm chart uses a dynamic PVC using CephFS to save results from the `scythe` run. An example configuration file is shown below.
For more information on CephFS on the cluster see [k8s-cluster docs](https://github.com/DataONEorg/k8s-cluster/blob/main/storage/Ceph/Ceph-CSI-CephFS.md#provisioning-dynamic-cephfs-volumes).

```
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: scythe-results
spec:
accessModes:
- ReadWriteMany
resources:
requests:
storage: 10Gi
storageClassName: csi-cephfs-sc
```

To create the PVC, run `kubectl apply -f pvc.yaml -n scythe`. This should only be done once.
85 changes: 85 additions & 0 deletions helm/scripts/search.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/env Rscript
suppressPackageStartupMessages(library(dplyr))
library(tidyr)
library(scythe)
library(dataone)
library(jsonlite)
suppressPackageStartupMessages(library(lubridate))
library(optparse)

# Entry point for the scheduled citation search.
#
# Parses the command-line options (`--rows`, `--nodes`), gathers DOIs from each
# requested node via get_node_dois(), searches PLOS, xDD, Scopus and Springer
# with citation_search(), drops citation pairs that get_metrics_citations()
# already returns, and writes the outcome to `scythe-citations-<YYYYMMDD>.csv`
# in the working directory. When there is nothing to report, the file holds a
# one-line notice instead of a table.
main <- function() {
  option_list <- list(
    make_option(c("-r", "--rows"), type = "integer", default = 100000,
                help = "Number of rows to return from query [default %default]"),
    make_option(c("-n", "--nodes"), type = "character",
                help = "Comma separated list of nodes to query")
  )

  # parse command-line arguments
  parser <- OptionParser(option_list = option_list)
  opts <- parse_args(parser)

  num_rows <- opts$rows

  # --nodes has no default; fail with a clear message instead of letting
  # strsplit() error on NULL.
  if (is.null(opts$nodes) || !nzchar(opts$nodes)) {
    stop("No nodes supplied; use --nodes with a comma separated list.",
         call. = FALSE)
  }
  # Tolerate "a, b" style lists and stray commas.
  nodes <- trimws(strsplit(opts$nodes, ",", fixed = TRUE)[[1]])
  nodes <- nodes[nzchar(nodes)]

  sources <- c("plos", "xdd", "scopus", "springer")

  # Collect DOIs per node, then flatten; avoids growing a vector in a loop.
  dois <- unlist(lapply(nodes, function(node) {
    message(paste("Gathering DOIs for: ", node))
    get_node_dois(node, num_rows)
  }), use.names = FALSE)
  dois_unique <- unique(dois)

  # set up file to write to
  today <- format(Sys.Date(), "%Y%m%d")
  fp <- paste0("scythe-citations-", today, ".csv")

  # Nothing to search for: record that and skip the search entirely.
  if (length(dois_unique) == 0) {
    message("No DOIs found for the requested nodes.")
    writeLines("No citations found.", fp)
    return(invisible(NULL))
  }

  message("Beginning citations search.")
  found_citations <- citation_search(dois_unique, sources)

  if (is.null(found_citations) || nrow(found_citations) == 0) {
    writeLines("No citations found.", fp)
  } else {
    # Keep only dataset/article pairs the metrics service does not have yet.
    existing_citations <- get_metrics_citations()
    new_citations <- anti_join(
      found_citations,
      existing_citations,
      by = c("dataset_id" = "target_id", "article_id" = "source_id")
    )
    if (nrow(new_citations) > 0) {
      # The argument is `row.names`; `row_names` is rejected by write.csv().
      write.csv(new_citations, fp, row.names = FALSE)
    } else {
      writeLines("No new citations found.", fp)
    }
  }
}

# Fetch the DOI identifiers held by one member node.
#
# node_id:  identifier of the member node to query.
# num_rows: maximum number of Solr rows to request.
#
# Returns a character vector of every `id` / `seriesId` value in the query
# result that contains "doi:".
get_node_dois <- function(node_id, num_rows) {
  # Look the member node up through the "PROD" coordinating node.
  member_node <- getMNode(CNode("PROD"), node_id)

  solr_params <- list(
    q = "id:doi* OR seriesId:doi*",
    fl = "id, seriesId",
    start = "0",
    rows = num_rows
  )
  hits <- query(member_node, solrQuery = solr_params,
                as = "data.frame", parse = FALSE)

  # Identifiers and series identifiers are pooled before filtering.
  candidates <- c(hits$id, hits$seriesId)
  grep("doi:", candidates, value = TRUE)
}
# Retrieve the citations already known to the metrics service.
#
# from, to: start and end of the month range to query; anything as.Date()
#           accepts. Both are sent to the service formatted as MM/DD/YYYY.
#
# Returns a data frame of citation records with `target_id` and `source_id`
# unnested to one value per row. When the service reports no citations, an
# empty data frame with those two character columns is returned, so callers
# can still join on them.
#
# NOTE(review): the URL targets a "stage"/"test" host and filters on
# urn:node:ARCTIC only, while main() accepts arbitrary nodes -- confirm this
# is intended.
get_metrics_citations <- function(from = as.POSIXct("2000-01-01"), to = as.POSIXct(Sys.Date())) {
  # format() zero-pads month and day directly; no manual padding needed.
  from_q <- format(as.Date(from), "%m/%d/%Y")
  to_q <- format(as.Date(to), "%m/%d/%Y")

  d <- fromJSON(paste0('https://logproc-stage-ucsb-1.test.dataone.org/metrics?q={%22metricsPage%22:{%22total%22:0,%22start%22:0,%22count%22:0},%22metrics%22:[%22citations%22],%22filterBy%22:[{%22filterType%22:%22repository%22,%22values%22:[%22urn:node:ARCTIC%22],%22interpretAs%22:%22list%22},{%22filterType%22:%22month%22,%22values%22:[%22', from_q,'%22,%22', to_q, '%22],%22interpretAs%22:%22range%22}],%22groupBy%22:[%22month%22]}'))
  output_json <- d$resultDetails$citations # pulls citation info

  # No citations: the unnest calls below would fail on the missing columns.
  if (length(output_json) == 0) {
    return(data.frame(target_id = character(), source_id = character()))
  }

  # binds nested citation info into a data frame
  output_df <- as.data.frame(do.call(rbind, output_json), row.names = FALSE)
  output_df <- output_df %>%
    unnest_longer(target_id) %>%
    unnest_longer(source_id)
  return(output_df)
}

main()
8 changes: 8 additions & 0 deletions helm/templates/NOTES.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
1. Get the application URL by running these commands:
{{- if .Values.ingress.enabled }}
{{- range $host := .Values.ingress.hosts }}
{{- range .paths }}
http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }}
{{- end }}
{{- end }}
{{- end }}
62 changes: 62 additions & 0 deletions helm/templates/_helpers.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{{/*
Expand the name of the chart.
Uses .Values.nameOverride when it is set, otherwise .Chart.Name. The result is
truncated to 63 characters and a trailing "-" is removed.
*/}}
{{- define "scythe.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
Resolution order:
  1. .Values.fullnameOverride, when set.
  2. The release name alone, when it already contains the chart name
     (or .Values.nameOverride, when set).
  3. Otherwise "<release name>-<chart name or nameOverride>".
In every case a trailing "-" left by the truncation is removed.
*/}}
{{- define "scythe.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}

{{/*
Create chart name and version as used by the chart label.
Renders "<chart name>-<chart version>", replaces any "+" with "_"
(presumably because "+" is not allowed in label values), truncates to
63 characters and removes a trailing "-".
*/}}
{{- define "scythe.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Common labels
Emits helm.sh/chart, the selector labels from "scythe.selectorLabels",
app.kubernetes.io/version (only when the chart defines an appVersion) and
app.kubernetes.io/managed-by (the releasing service).
*/}}
{{- define "scythe.labels" -}}
helm.sh/chart: {{ include "scythe.chart" . }}
{{ include "scythe.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}

{{/*
Selector labels
Emits app.kubernetes.io/name (from "scythe.name") and
app.kubernetes.io/instance (the release name). These are also included in
"scythe.labels".
*/}}
{{- define "scythe.selectorLabels" -}}
app.kubernetes.io/name: {{ include "scythe.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Create the name of the service account to use
When .Values.serviceAccount.create is true: .Values.serviceAccount.name if
set, otherwise the "scythe.fullname" value.
When it is false: .Values.serviceAccount.name if set, otherwise "default".
*/}}
{{- define "scythe.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "scythe.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}
Loading

0 comments on commit dea912f

Please sign in to comment.