openreview · purujitgoyal · Jul 15, 2021 · Jul 15, 2021 · Jul 16, 2021 · Jul 16, 2021
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,18 @@
+[flake8]
+max-line-length = 79
+exclude = .tox,*.egg,build,temp
+select = E,W,F
+max-complexity = 18
+verbose = 2
+# https://pep8.readthedocs.io/en/latest/intro.html#error-codes
+format = pylint
+ignore =
+    E731
+    E741
+    W504
+    F401
+    F841
+    E203  # E203 - whitespace before ':'. Opposite convention enforced by black
+    E231  # E231: missing whitespace after ',', ';', or ':'; for black
+    E501  # E501 - line too long. Handled by black, we have longer lines
+    W503  # W503 - line break before binary operator, need for black
diff --git a/.gitignore b/.gitignore
@@ -3,4 +3,4 @@ openreview_expertise.egg-info
 __pycache__
 
 /tmp
-*.log
+*.log
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,18 @@
+default_language_version:
+  python: python3
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.0.1
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+  - repo: https://github.com/ambv/black
+    rev: 21.7b0
+    hooks:
+      - id: black
+        language_version: python3
+  - repo: https://gitlab.com/pycqa/flake8
+    rev: 3.9.2
+    hooks:
+      - id: flake8
+        language_version: python3
diff --git a/README.md b/README.md
@@ -43,13 +43,13 @@ cd specter
 wget https://ai2-s2-research-public.s3-us-west-2.amazonaws.com/specter/archive.tar.gz
 tar -xzvf archive.tar.gz
 
-conda install pytorch cudatoolkit=10.1 -c pytorch 
+conda install pytorch cudatoolkit=10.1 -c pytorch
 pip install -r requirements.txt
 python setup.py install
 conda install filelock
 cd ..
 ```
-Pass the path to the cloned GitHub repository as `model_params.specter_dir`. 
+Pass the path to the cloned GitHub repository as `model_params.specter_dir`.
 
 If you plan to use Multifacet-Recommender / SPECTER+MFR, download the checkpoint files from [here](https://drive.google.com/file/d/1_mWkQ1dr_Vl121WZkbNyNMV3G_bmoQ6s/view?usp=sharing), extract it, and pass the paths:
 ```
@@ -63,6 +63,19 @@ https://www.overleaf.com/read/ygmygwtjbzfg
 
 https://www.overleaf.com/read/swqrxgqqvmyv
 
+If you plan to use the SentencePiece model, follow the training procedure described [here](https://github.com/acl-org/reviewer-paper-matching) to train it, and then pass the path to the trained model directory. The directory structure that `expertise` expects for the model files is as follows:
+```
+path_to_trained_model_dir/
+	scratch/
+	    abstracts.sp.20k.model
+	    abstracts.sp.20k.model.model
+	    abstracts.sp.20k.model.vocab
+	    abstracts.sp.20k.vocab
+	    similarity-model.pt
+```
+
+The `path_to_trained_model_dir` should be passed as `model_params.model_dir` in the config discussed in the Configuration section.
+
 ## Affinity Scores
 
 There are two steps to create affinity scores:
@@ -80,7 +93,7 @@ python -m expertise.create_dataset config.json \
 	--username <your_username> \
 ```
 
-For ELMo, SPECTER, Multifacet-Recommender and BM25 run the following command
+For ELMo, SPECTER, Multifacet-Recommender, SentencePiece-ACL and BM25, run the following command:
 ```
 python -m expertise.run config.json
 ```
@@ -133,7 +146,7 @@ python -m expertise.service --host localhost --port 5000
 
 By default, the app will run on `http://localhost:5000`. The endpoint `/expertise/test` should show a simple page indicating that Flask is running. Accessing the `/expertise` endpoint to compute affinity scores **requires** valid authentication in the headers of the request (i.e submitted from a logged in Python client)
 
-In order to start the Celery queue worker, use: 
+In order to start the Celery queue worker, use:
 ```
 celery --app expertise.service.server.celery_app worker
 ```
@@ -431,6 +444,38 @@ Here is an example:
 }
 ```
 
+#### SentencePiece-ACL specific parameters (affinity scores):
+- `model_params.model_dir`: Path to the unpacked model directory. The model checkpoint will be loaded relative to this directory.
+- `model_params.batch_size`: Batch size when running SentencePiece Model. This defaults to 32.
+- `model_params.publications_path`: When running SentencePiece, this is where the embedded abstracts/titles of the Reviewers (and Area Chairs) are stored.
+- `model_params.submissions_path`: When running SentencePiece, this is where the embedded abstracts/titles of the Submissions are stored.
+- `model_params.max_score` (boolean, defaults to `true`): When `true`, the reviewer is assigned based on the maximum similarity between the submission and the embeddings of the reviewer's authored publications.
+- `model_params.weighted_topk` (int, defaults to 0): This parameter specifies that the reviewer is assigned based on the weighted average of the top `k` similarity scores between the submission and the embeddings of the reviewer's authored publications. This is skipped if `model_params.max_score` is set to `true`.
+- `model_params.skip_model` (boolean): Since running SentencePiece can take a significant amount of time, the vectors are saved as jsonl files in `model_params.submissions_path` and `model_params.publications_path`. Set this to `true` to skip running the model and load all the vectors from those jsonl files instead.
+- `model_params.use_cuda`: Boolean to indicate whether to use GPU (`true`) or CPU (`false`) when running the SentencePiece model. It defaults to CPU (`false`).
+
+Here is an example:
+```
+{
+    "name": "iclr2020_sentence_piece",
+    "dataset": {
+        "directory": "./data/"
+    },
+    "model": "sentence_piece_acl",
+    "model_params": {
+        "model_dir": "../acl-sentence-piece/",
+        "max_score": true,
+        "batch_size": 16,
+        "skip_model": false,
+        "weighted_topk": 0,
+        "publications_path": "./",
+        "submissions_path": "./",
+        "use_cuda": false,
+        "scores_path": "./"
+    }
+}
+```
+
 #### ELMo specific parameters (duplicate detection):
 - `model_params.other_submissions_path`: When running ELMo, this is where the embedded abstracts/titles of the other Submissions are stored.
 All the other parameters are the same as in the affinity scores.

diff --git a/expertise/__init__.py b/expertise/__init__.py
@@ -1,10 +1,7 @@
-from .core import *
+from .core import load_model
 from . import config
 from . import dataset
 from . import models
 from . import preprocess
 from . import setup
-from . import test
-from . import train
 from . import utils
-
diff --git a/expertise/config/__init__.py b/expertise/config/__init__.py
@@ -1 +1 @@
-from .core import *
+from .core import ModelConfig
diff --git a/expertise/config/__main__.py b/expertise/config/__main__.py
@@ -1,6 +1,6 @@
-'''
+"""
 
-'''
+"""
 from __future__ import absolute_import
 
 import argparse
@@ -9,18 +9,16 @@
 import expertise
 
 parser = argparse.ArgumentParser()
-parser.add_argument('model', help=f'select one of {expertise.available_models()}')
-parser.add_argument('--outfile', '-o', help='file to write config')
+parser.add_argument("model", help=f"select one of {expertise.available_models()}")
+parser.add_argument("--outfile", "-o", help="file to write config")
 
 args = parser.parse_args()
 
 config = expertise.config.ModelConfig(model=args.model)
 
-outfile = args.outfile if args.outfile else f'./{args.model}.json'
+outfile = args.outfile if args.outfile else f"./{args.model}.json"
 
 experiment_dir = os.path.dirname(os.path.abspath(outfile))
 
 config.update(experiment_dir=experiment_dir)
 config.save(outfile)
-
-
diff --git a/expertise/config/core.py b/expertise/config/core.py
@@ -6,15 +6,16 @@
 import pkgutil
 import expertise
 
+
 class ModelConfig(UserDict):
     def __init__(self, **kwargs):
         super(UserDict, self).__init__()
-        if kwargs.get('config_file_path'):
-            config_file_path = Path(kwargs['config_file_path'])
+        if kwargs.get("config_file_path"):
+            config_file_path = Path(kwargs["config_file_path"])
             with open(config_file_path) as file_handle:
                 self.data = json.load(file_handle)
-        elif kwargs.get('config_dict'):
-            self.data = kwargs['config_dict']
+        elif kwargs.get("config_dict"):
+            self.data = kwargs["config_dict"]
 
     def __repr__(self):
         return json.dumps(self.data, indent=4)
@@ -23,8 +24,8 @@ def update(self, **kwargs):
         self.data = {**self.data, **kwargs}
 
     def save(self, outfile):
-        with open(outfile, 'w') as f:
-            json.dump(self.data, f, indent=4, separators=(',', ': '))
+        with open(outfile, "w") as f:
+            json.dump(self.data, f, indent=4, separators=(",", ": "))
 
     def update_from_file(self, file):
         config_path = Path(file).resolve()

diff --git a/expertise/core.py b/expertise/core.py
@@ -1,11 +1,14 @@
 import pkgutil
 from . import models
 
+
 def model_importers():
     return {m: i for i, m, _ in pkgutil.iter_modules(models.__path__)}
 
+
 def available_models():
     return [k for k in model_importers().keys()]
 
+
 def load_model(module_name):
-	return model_importers()[module_name].find_module(module_name).load_module()
+    return model_importers()[module_name].find_module(module_name).load_module()