From a6c9b77104a2d94f7d59c4aa97304988ff6ec75e Mon Sep 17 00:00:00 2001
From: SpacemanSteve
Date: Fri, 14 Feb 2020 13:01:28 -0500
Subject: [PATCH] add local cache of classic data files

---
 config.py           |  9 ++++-----
 copy_input_files.sh | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 5 deletions(-)
 create mode 100755 copy_input_files.sh

diff --git a/config.py b/config.py
index 3543fc3..d6cf48a 100644
--- a/config.py
+++ b/config.py
@@ -22,11 +22,10 @@
 
 #Order matches their priority
 BIBCODE_FILES = [
-    '/proj/ads/abstracts/ast/load/current/index.status',
-    '/proj/ads/abstracts/phy/load/current/index.status',
-    '/proj/ads/abstracts/gen/load/current/index.status',
-    '/proj/ads/abstracts/pre/load/current/index.status',
-
+    './logs/input/current/ast/load/current/index.status',
+    './logs/input/current/phy/load/current/index.status',
+    './logs/input/current/gen/load/current/index.status',
+    './logs/input/current/pre/load/current/index.status',
 ]
 
 BIBCODES_PER_JOB = 100
diff --git a/copy_input_files.sh b/copy_input_files.sh
new file mode 100755
index 0000000..edf9f1e
--- /dev/null
+++ b/copy_input_files.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+#
+# Snapshot the classic index.status files into a timestamped directory under
+# ./logs/input, check that each copy has a plausible number of lines, then
+# repoint ./logs/input/current at the new snapshot.
+# Paths are relative: run from the directory that contains ./logs.
+
+set -euo pipefail
+
+INPUT_BASE=/proj/ads/abstracts/
+TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S)
+OUTPUT_BASE=./logs/input/input.$TIMESTAMP/
+
+# Each entry is <path relative to INPUT_BASE>:<minimum acceptable line count>
+FILES_INFO=(
+  ast/load/current/index.status:2100000
+  phy/load/current/index.status:9000000
+  gen/load/current/index.status:1300000
+  pre/load/current/index.status:1500000
+)
+
+# Delete old input files (snapshot directories not modified for over 7 days).
+# -maxdepth/-prune stop find from descending into a directory that rm has just
+# removed; that failed descent makes find exit non-zero, which under set -e
+# would abort the script before any file is copied.
+if [ -d ./logs/input ]; then
+  find ./logs/input/ -mindepth 1 -maxdepth 1 -type d \
+    -name "input.20*-*-*_*-*-*" -mtime +7 -prune \
+    -exec rm -rf -- '{}' \;
+fi
+
+# create local copies of files
+for FILE_INFO in "${FILES_INFO[@]}"; do
+  FILE=${FILE_INFO%%:*}
+  mkdir -p "$(dirname "$OUTPUT_BASE$FILE")"
+  echo "INFO: $(date) copying $INPUT_BASE$FILE to $OUTPUT_BASE$FILE"
+  cp -v "$INPUT_BASE$FILE" "$OUTPUT_BASE$FILE"
+done
+
+# validate local files
+for FILE_INFO in "${FILES_INFO[@]}"; do
+  FILE=${FILE_INFO%%:*}
+  MIN_LINES=${FILE_INFO##*:}
+  echo "INFO: $(date) validating $OUTPUT_BASE$FILE is at least $MIN_LINES lines long"
+  if [ "$(wc -l < "$OUTPUT_BASE$FILE")" -lt "$MIN_LINES" ]; then
+    echo "ERROR: file $OUTPUT_BASE$FILE has less than ${MIN_LINES} lines, processing aborted"
+    exit 1
+  fi
+done
+
+# ingest code expects latest files in directory named current
+echo "INFO: $(date) linking $PWD/logs/input/current to $PWD/$OUTPUT_BASE"
+rm -fv ./logs/input/current
+ln -fsv "$PWD/$OUTPUT_BASE" "$PWD/logs/input/current"