From 710c83fe95f4e2bef07acab4eb7b56c26ffaa93d Mon Sep 17 00:00:00 2001
From: Zach Bialecki
Date: Fri, 17 May 2024 15:17:45 -0400
Subject: [PATCH] Add useful scripts for working with MongoDB dumps

---
 samples/notebooks/catalyzex_invitations.ipynb | 171 ++++++++++++++++++
 utils/convert_to_json                         |  77 ++++++++
 utils/download_latest_dump                    |  50 +++++
 3 files changed, 298 insertions(+)
 create mode 100644 samples/notebooks/catalyzex_invitations.ipynb
 create mode 100755 utils/convert_to_json
 create mode 100755 utils/download_latest_dump

diff --git a/samples/notebooks/catalyzex_invitations.ipynb b/samples/notebooks/catalyzex_invitations.ipynb
new file mode 100644
index 000000000..c7f85f8e2
--- /dev/null
+++ b/samples/notebooks/catalyzex_invitations.ipynb
@@ -0,0 +1,171 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Post CatalyzeX Invitations"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "\n",
+    "from openreview.api import Invitation\n",
+    "from openreview.api import OpenReviewClient\n",
+    "from openreview import tools\n",
+    "from os import getenv\n",
+    "from openreview import OpenReviewException"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize clients\n",
+    "\n",
+    "# local_client = OpenReviewClient(baseurl='http://localhost:3001', username='OpenReview.net', password='')\n",
+    "dev_client = OpenReviewClient(baseurl='https://devapi2.openreview.net', username='OpenReview.net', password='')\n",
+    "# live_client = OpenReviewClient(baseurl='https://api2.openreview.net', username='OpenReview.net', password='')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Config vars\n",
+    "\n",
+    "client = dev_client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Posting CatalyzeX invitation for ICLR.cc/2024/Conference/-/Submission\n",
+      "Done!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Post invitations\n",
+    "\n",
+    "def post(venue_id, submission_invitation_id):\n",
+    "    formatted_venue_id = venue_id.replace('/', '_')\n",
+    "\n",
+    "    return client.post_invitation_edit(\n",
+    "        invitations=f'{venue_id}/-/Edit',\n",
+    "        readers = [venue_id, 'CatalyzeX.com'],\n",
+    "        writers = [venue_id, 'CatalyzeX.com'],\n",
+    "        signatures = ['OpenReview.net'],\n",
+    "        invitation = Invitation(\n",
+    "            id = f'{venue_id}/-/CatalyzeX',\n",
+    "            invitees = ['CatalyzeX.com'],\n",
+    "            readers = ['CatalyzeX.com'],\n",
+    "            writers = ['CatalyzeX.com'],\n",
+    "            signatures = ['CatalyzeX.com'],\n",
+    "            edit = {\n",
+    "                'readers': ['everyone'],\n",
+    "                'signatures': ['CatalyzeX.com'],\n",
+    "                'note': {\n",
+    "                    'id': {\n",
+    "                        'param': {\n",
+    "                            'withInvitation': submission_invitation_id\n",
+    "                        }\n",
+    "                    },\n",
+    "                    'writers': ['CatalyzeX.com'],\n",
+    "                    'content': {\n",
+    "                        'community_implementations': {\n",
+    "                            'order': 1,\n",
+    "                            'description': 'Optional link to open source implementations',\n",
+    "                            'value': {\n",
+    "                                'param': {\n",
+    "                                    'type': 'string',\n",
+    "                                    'maxLength': 500,\n",
+    "                                    'input': 'text',\n",
+    "                                    'optional': True,\n",
+    "                                    'deletable': True,\n",
+    "                                    'markdown': True\n",
+    "                                }\n",
+    "                            }\n",
+    "                        }\n",
+    "                    }\n",
+    "                },\n",
+    "                'ddate': {\n",
+    "                    'param': {\n",
+    "                        'range': [ 0, 9999999999999 ],\n",
+    "                        'optional': True,\n",
+    "                        'deletable': True\n",
+    "                    }\n",
+    "                }\n",
+    "            },\n",
+    "            duedate = 2556143999000\n",
+    "        )\n",
+    "    )\n",
+    "\n",
+    "with open('venue-ids-01052024.txt') as f:\n",
+    "    for line in f:\n",
+    "        group_id = line.strip()\n",
+    "        if group_id == '':\n",
+    "            continue\n",
+    "\n",
+    "        try:\n",
+    "            group = client.get_group(group_id)\n",
+    "        except OpenReviewException as e:\n",
+    "            print(f'Group not found: {group_id}')\n",
+    "            continue\n",
+    "\n",
+    "        # Get the submission invitation\n",
+    "        submission_invitation = None\n",
+    "        submission_invitation_name = group.get_content_value('submission_name', default_value='Submission')\n",
+    "        try:\n",
+    "            submission_invitation = client.get_invitation(f'{group_id}/-/{submission_invitation_name}')\n",
+    "        except OpenReviewException as e:\n",
+    "            print(f'Submission invitation not found for {group_id}')\n",
+    "            continue\n",
+    "\n",
+    "        # Post the CatalyzeX invitation\n",
+    "        print(f'Posting CatalyzeX invitation for {submission_invitation.id}')\n",
+    "        post(group_id, submission_invitation.id)\n",
+    "\n",
+    "print('Done!')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/utils/convert_to_json b/utils/convert_to_json
new file mode 100755
index 000000000..6810dc582
--- /dev/null
+++ b/utils/convert_to_json
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+#
+# Convert BSON files from a MongoDB dump into JSON files.
+#
+# Reads openreview_<collection>.bson files from ./openreview and writes the
+# matching .json file next to each one. Requires bsondump and sed.
+
+set -eEuo pipefail
+
+main() {
+  require_deps bsondump sed
+
+  local dump_dir
+  dump_dir="./openreview"
+
+  if [[ ! -d "$dump_dir" ]]; then
+    echo "Error: directory $dump_dir does not exist" >&2
+    exit 1
+  fi
+
+  # Collections to convert; dump files for anything else are left alone.
+  local allowed_collections
+  allowed_collections=(
+    "profiles"
+    "groups"
+    "group_edits"
+    "invitations"
+    "invitations_v2"
+    "invitation_edits"
+    "notes"
+    "notes_v2"
+    "note_edits"
+    "edges"
+    "tags"
+  )
+
+  local collection
+  local short_name
+  local full_name
+  local new_name
+  for collection in "${allowed_collections[@]}"; do
+    short_name="openreview_$collection.bson"
+    full_name="$dump_dir/$short_name"
+    # Swap only the trailing extension; a pattern substitution would rewrite
+    # the first "bson" found anywhere in the path.
+    new_name="${full_name%.bson}.json"
+
+    # Remove output from an earlier run even if the BSON source is now gone.
+    rm -f -- "${new_name}" "${new_name}.bak"
+
+    if [[ -f "${full_name}" ]]; then
+      echo "Converting $short_name..."
+      bsondump "$full_name" --outFile="$new_name"
+      # "-i.bak" (suffix attached) is accepted by both GNU and BSD sed, unlike
+      # "-i ''", which GNU sed reads as an extra input file named "".
+      # NOTE(review): backslashes are doubled before \u0000 is stripped, so a
+      # \u0000 sequence leaves one backslash behind; confirm this is intended.
+      sed -i.bak -e 's/\\/\\\\/g' -e 's/\\u0000//g' "$new_name"
+      rm -f -- "${new_name}.bak"
+    fi
+  done
+
+  echo "Done converting bson files!"
+}
+
+# Exit with an error unless every command named in the arguments is on PATH.
+require_deps() {
+  local dep
+  for dep in "$@"; do
+    command -v "${dep}" >/dev/null 2>&1 || {
+      echo "Required command ${dep} not found" >&2
+      exit 1
+    }
+  done
+}
+
+main "$@"
diff --git a/utils/download_latest_dump b/utils/download_latest_dump
new file mode 100755
index 000000000..18cb695cb
--- /dev/null
+++ b/utils/download_latest_dump
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+#
+# Download the most recent MongoDB dump from Google Cloud Storage and unarchive
+# it. Requires gcloud command to be installed and configured.
+
+set -eEuo pipefail
+IFS=$'\n\t'
+
+readonly BUCKET="gs://openreview-mongodb-dumps"
+
+main() {
+  require_deps gcloud tar
+
+  # The last line of the listing is treated as the latest dump; this assumes
+  # object names sort chronologically -- TODO confirm.
+  local latest
+  latest="$(gcloud storage ls "$BUCKET" | tail -n 1)"
+  if [[ -z "$latest" ]]; then
+    echo "Error: no dumps found in $BUCKET" >&2
+    exit 1
+  fi
+
+  # Strip the literal "<bucket>/" prefix to get the local file name.
+  local filename
+  filename="${latest#"$BUCKET/"}"
+
+  gcloud storage cp "${latest}" .
+
+  # Clear ./openreview before extracting (presumably the archive unpacks
+  # there -- confirm).
+  rm -rf ./openreview
+
+  echo "Unarchiving ${filename}..."
+  tar -zxf "${filename}" --directory .
+
+  echo "Download complete!"
+}
+
+# Exit with an error unless every command named in the arguments is on PATH.
+require_deps() {
+  local dep
+  for dep in "$@"; do
+    command -v "${dep}" >/dev/null 2>&1 || {
+      echo "Required command ${dep} not found" >&2
+      exit 1
+    }
+  done
+}
+
+main "$@"