ACL Sentence Piece Model integration from master #80

Draft: wants to merge 3 commits into base `master`.
13 changes: 13 additions & 0 deletions README.md
@@ -63,6 +63,19 @@ https://www.overleaf.com/read/ygmygwtjbzfg

https://www.overleaf.com/read/swqrxgqqvmyv

If you plan to use the SentencePiece model, you can follow the training procedure described [here](https://github.com/acl-org/reviewer-paper-matching) to train the model, then pass the path to the trained model directory. The directory structure expected by the expertise service is as follows:
```
path_to_trained_model_dir/
    scratch/
        abstracts.sp.20k.model
        abstracts.sp.20k.model.model
        abstracts.sp.20k.model.vocab
        abstracts.sp.20k.vocab
        similarity-model.pt
```

The `path_to_trained_model_dir` should be passed as `model_params.model_dir` in the config discussed in the Configuration section.
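For reference, a minimal sketch of the keys the `sentence_piece_acl` model reads from this config, written here as the Python dict that `execute_expertise` consumes (all paths are placeholders; see the Configuration section for the full set of options):

```
config = {
    "name": "acl_affinity",                        # output scores file becomes <name>.csv
    "model": "sentence_piece_acl",
    "dataset": {"directory": "./dataset"},         # dataset directory (see Configuration section)
    "model_params": {
        "model_dir": "path_to_trained_model_dir",  # directory sketched above
        "publications_path": "./embeddings",       # pub2vec.pkl is written here
        "submissions_path": "./embeddings",        # sub2vec.pkl is written here
        "scores_path": "./scores",                 # <name>.csv is written here
        "use_cuda": False,
        "batch_size": 32,
        "max_score": True,
        "normalize": False,
    },
}
```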

## Affinity Scores

There are two steps to create affinity scores:
40 changes: 40 additions & 0 deletions expertise/execute_expertise.py
@@ -15,6 +15,46 @@ def execute_expertise(config):
elif Path(config['dataset']['directory']).joinpath('submissions.json').exists():
submissions_dataset = SubmissionsDataset(submissions_file=Path(config['dataset']['directory']).joinpath('submissions.json'))

if config["model"] == "sentence_piece_acl":
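# Score reviewer-submission affinity with the trained SentencePiece + similarity
# (ACL) model: embed archives and submissions, then write scores to <name>.csv.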
from .models import acl_scorer

acl_scorer = acl_scorer.Model(
batch_size=config["model_params"].get("batch_size", 32),
max_score=config["model_params"].get("max_score", True),
weighted_topk=config["model_params"].get("weighted_topk", 0),
sparse_value=config["model_params"].get("sparse_value", None),
normalize=config["model_params"].get("normalize", False),
use_cuda=config["model_params"].get("use_cuda", False),
)
acl_scorer.set_archives_dataset(archives_dataset)
acl_scorer.set_submissions_dataset(submissions_dataset)
acl_scorer.set_pub_note_author_mapping()
acl_scorer.load_model(data=None, model_dir=config["model_params"]["model_dir"])

if not config["model_params"].get("skip_model", False):
acl_scorer.embed_publications(
publications_path=Path(
config["model_params"]["publications_path"]
).joinpath("pub2vec.pkl")
)
acl_scorer.embed_submissions(
submissions_path=Path(
config["model_params"]["submissions_path"]
).joinpath("sub2vec.pkl")
)

acl_scorer.all_scores(
publications_path=Path(
config["model_params"]["publications_path"]
).joinpath("pub2vec.pkl"),
submissions_path=Path(config["model_params"]["submissions_path"]).joinpath(
"sub2vec.pkl"
),
scores_path=Path(config["model_params"]["scores_path"]).joinpath(
config["name"] + ".csv"
),
)

if config['model'] == 'bm25':
from .models import bm25
bm25Model = bm25.Model(
1 change: 1 addition & 0 deletions expertise/models/acl_scorer/__init__.py
@@ -0,0 +1 @@
from .scorer import Model
49 changes: 49 additions & 0 deletions expertise/models/acl_scorer/instance.py
@@ -0,0 +1,49 @@
from .utils import unk_string, lookup


class Instance(object):
def __init__(self, sentence):
self.sentence = sentence
self.embeddings = []

def populate_ngrams(self, words, zero_unk, n):
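# Convert the sentence into character n-gram vocabulary indices; if no n-gram
# is found in the vocabulary, fall back to the unknown token.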
embeddings = []
if type(self.sentence) == str:
sentence = [self.sentence]
else:
sentence = self.sentence
for i in sentence:
sent = " " + i.strip() + " "

for j in range(len(sent)):
idx = j
gr = ""
while idx < j + n and idx < len(sent):
gr += sent[idx]
idx += 1
if not len(gr) == n:
continue
wd = lookup(words, gr, zero_unk)
if wd is not None:
embeddings.append(wd)

if len(embeddings) == 0:
return [words[unk_string]]
return embeddings

def populate_embeddings(self, words, zero_unk, ngrams):
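# Use character n-grams when `ngrams` is non-zero; otherwise look up
# whitespace-separated tokens.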
if ngrams:
self.embeddings = self.populate_ngrams(words, zero_unk, ngrams)
else:
if type(self.sentence) == str:
sentence = [self.sentence]
else:
sentence = self.sentence
for i in sentence:
arr = i.split()
for i in arr:
wd = lookup(words, i, zero_unk)
if wd is not None:
self.embeddings.append(wd)
if len(self.embeddings) == 0:
self.embeddings = [words[unk_string]]
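For reference, a minimal usage sketch of `Instance` with a hypothetical toy vocabulary; `unk_string` and `lookup` come from the accompanying `utils` module, which is not part of this diff:

```
from expertise.models.acl_scorer.instance import Instance
from expertise.models.acl_scorer.utils import unk_string

# Toy vocabulary mapping tokens to embedding-row indices.
vocab = {unk_string: 0, "graph": 1, "neural": 2, "networks": 3}

inst = Instance("graph neural networks for peer review")
# Word-level lookup (ngrams=0); out-of-vocabulary words are handled by
# `lookup` according to `zero_unk`.
inst.populate_embeddings(vocab, zero_unk=False, ngrams=0)
print(inst.embeddings)

# Passing a positive `ngrams` value switches to character n-gram lookup,
# which expects an n-gram vocabulary instead of a word vocabulary.
```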
272 changes: 272 additions & 0 deletions expertise/models/acl_scorer/models.py
@@ -0,0 +1,272 @@
import random

import sentencepiece as spm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.nn.modules.distance import CosineSimilarity
from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import pad_packed_sequence as unpack

from .instance import Instance
from .utils import max_pool, mean_pool


class ParaModel(nn.Module):
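# Base sentence-similarity model: embedding table(s), optional SentencePiece
# tokenizer, cosine-similarity scoring, and a margin ranking loss.
# Subclasses (Averaging, LSTM) provide the encoder.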
def __init__(
self, data, args, vocab, vocab_fr, model_dir="expertise/models/acl_scorer/"
):
super(ParaModel, self).__init__()

self.raw_data = data
self.args = args
self.gpu = args.gpu
self.model_dir = model_dir

self.vocab = vocab
self.vocab_fr = vocab_fr
self.ngrams = args.ngrams
self.seg_length = args.seg_length

self.delta = args.delta
self.pool = args.pool

self.dropout = args.dropout
self.share_encoder = args.share_encoder
self.share_vocab = args.share_vocab
self.zero_unk = args.zero_unk

self.batchsize = args.batchsize
self.max_megabatch_size = args.megabatch_size
self.curr_megabatch_size = 1
self.megabatch = []
self.megabatch_anneal = args.megabatch_anneal
self.increment = False

self.sim_loss = nn.MarginRankingLoss(margin=self.delta)
self.cosine = CosineSimilarity()

self.embedding = nn.Embedding(len(self.vocab), self.args.dim)
if self.vocab_fr is not None:
self.embedding_fr = nn.Embedding(len(self.vocab_fr), self.args.dim)

self.sp = None
if args.sp_model:
self.sp = spm.SentencePieceProcessor()
self.sp.Load(self.model_dir + args.sp_model)

def save_params(self, epoch):
torch.save(
{
"state_dict": self.state_dict(),
"vocab": self.vocab,
"vocab_fr": self.vocab_fr,
"args": self.args,
"optimizer": self.optimizer.state_dict(),
"epoch": epoch,
},
"{0}_{1}.pt".format(self.args.outfile, epoch),
)
return "{0}_{1}.pt".format(self.args.outfile, epoch)

def save_final_params(self):
print("Saving final model...")
torch.save(
{
"state_dict": self.state_dict(),
"vocab": self.vocab,
"vocab_fr": self.vocab_fr,
"args": self.args,
"optimizer": self.optimizer.state_dict(),
"epoch": self.args.epochs,
},
"{0}".format(self.args.outfile),
) # .pt is in input string

def loss_function(self, g1, g2, p1, p2):
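# Margin ranking loss: the similarity of the positive pair (g1, g2) should
# exceed each sentence's similarity to its negative (p1, p2) by at least `delta`.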
g1g2 = self.cosine(g1, g2)
g1p1 = self.cosine(g1, p1)
g2p2 = self.cosine(g2, p2)

ones = torch.ones(g1g2.size()[0])
if self.gpu:
ones = ones.cuda()

loss = self.sim_loss(g1g2, g1p1, ones) + self.sim_loss(g1g2, g2p2, ones)

return loss

def scoring_function(self, g_idxs1, g_lengths1, g_idxs2, g_lengths2, fr0=0, fr1=0):
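# Affinity score: cosine similarity between the encodings of two sentence batches.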
g1 = self.encode(g_idxs1, g_lengths1, fr=fr0)
g2 = self.encode(g_idxs2, g_lengths2, fr=fr1)
return self.cosine(g1, g2)

def pair_up_data(self):
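# Build training pairs by splitting each sentence into segments of roughly
# `seg_length` words, shuffling the segments, and pairing the two halves
# as pseudo-paraphrases.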
idx = random.randint(0, self.seg_length)
pairs = []
for i in self.raw_data:
sent = i.sentence
sent = sent.split()
idx = min(idx, len(sent) - 2)
splits = []
start = 0
while idx < len(sent):
seg1 = sent[start:idx]
splits.append(seg1)
start = idx
idx += self.seg_length
idx = min(idx, len(sent))
if idx > len(sent):
seg = sent[start : len(sent)]
splits.append(seg)
splits = [" ".join(i) for i in splits]
random.shuffle(splits)
mid = len(splits) // 2
pairs.append((Instance(splits[0:mid]), Instance(splits[mid:])))
return pairs


class Averaging(ParaModel):
def __init__(self, data, args, vocab, vocab_fr, model_dir):
super(Averaging, self).__init__(data, args, vocab, vocab_fr, model_dir)
self.parameters = self.parameters()
self.optimizer = optim.Adam(self.parameters, lr=self.args.lr)

if args.gpu:
self.cuda()

print(self)

def forward(self, curr_batch):
g_idxs1 = curr_batch.g1
g_lengths1 = curr_batch.g1_l

g_idxs2 = curr_batch.g2
g_lengths2 = curr_batch.g2_l

p_idxs1 = curr_batch.p1
p_lengths1 = curr_batch.p1_l

p_idxs2 = curr_batch.p2
p_lengths2 = curr_batch.p2_l

g1 = self.encode(g_idxs1, g_lengths1)
g2 = self.encode(g_idxs2, g_lengths2, fr=1)
p1 = self.encode(p_idxs1, p_lengths1, fr=1)
p2 = self.encode(p_idxs2, p_lengths2)

return g1, g2, p1, p2

def encode(self, idxs, lengths, fr=0):
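# Look up word embeddings, apply dropout during training, and max- or
# mean-pool over the sequence to produce a fixed-size sentence vector.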
if fr and not self.share_vocab:
word_embs = self.embedding_fr(idxs)
else:
word_embs = self.embedding(idxs)

if self.dropout > 0:
word_embs = F.dropout(word_embs, p=self.dropout, training=self.training)  # keep the result; F.dropout is not in-place

if self.pool == "max":
word_embs = max_pool(word_embs, lengths, self.args.gpu)
elif self.pool == "mean":
word_embs = mean_pool(word_embs, lengths, self.args.gpu)

return word_embs


class LSTM(ParaModel):
def __init__(self, data, args, vocab, vocab_fr, model_dir):
super(LSTM, self).__init__(data, args, vocab, vocab_fr, model_dir)

self.hidden_dim = self.args.hidden_dim

self.e_hidden_init = torch.zeros(2, 1, self.args.hidden_dim)
self.e_cell_init = torch.zeros(2, 1, self.args.hidden_dim)

if self.gpu:
self.e_hidden_init = self.e_hidden_init.cuda()
self.e_cell_init = self.e_cell_init.cuda()

self.lstm = nn.LSTM(
self.args.dim,
self.hidden_dim,
num_layers=1,
bidirectional=True,
batch_first=True,
)

if not self.share_encoder:
self.lstm_fr = nn.LSTM(
self.args.dim,
self.hidden_dim,
num_layers=1,
bidirectional=True,
batch_first=True,
)

self.parameters = self.parameters()
self.optimizer = optim.Adam(
filter(lambda p: p.requires_grad, self.parameters), self.args.lr
)

if self.gpu:
self.cuda()

print(self)

def encode(self, inputs, lengths, fr=0):
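# Sort by length, run a bidirectional LSTM over the packed embeddings,
# restore the original order, then max- or mean-pool the hidden states
# into a sentence vector.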
bsz, max_len = inputs.size()
e_hidden_init = self.e_hidden_init.expand(2, bsz, self.hidden_dim).contiguous()
e_cell_init = self.e_cell_init.expand(2, bsz, self.hidden_dim).contiguous()
lens, indices = torch.sort(lengths, 0, True)

if fr and not self.share_vocab:
in_embs = self.embedding_fr(inputs)
else:
in_embs = self.embedding(inputs)

if fr and not self.share_encoder:
if self.dropout > 0:
in_embs = F.dropout(in_embs, p=self.dropout, training=self.training)
all_hids, (enc_last_hid, _) = self.lstm_fr(
pack(in_embs[indices], lens.tolist(), batch_first=True),
(e_hidden_init, e_cell_init),
)
else:
if self.dropout > 0:
in_embs = F.dropout(in_embs, p=self.dropout, training=self.training)
all_hids, (enc_last_hid, _) = self.lstm(
pack(in_embs[indices], lens.tolist(), batch_first=True),
(e_hidden_init, e_cell_init),
)

_, _indices = torch.sort(indices, 0)
all_hids = unpack(all_hids, batch_first=True)[0][_indices]

if self.pool == "max":
embs = max_pool(all_hids, lengths, self.gpu)
elif self.pool == "mean":
embs = mean_pool(all_hids, lengths, self.gpu)

return embs

def forward(self, curr_batch):
g_idxs1 = curr_batch.g1
g_lengths1 = curr_batch.g1_l

g_idxs2 = curr_batch.g2
g_lengths2 = curr_batch.g2_l

p_idxs1 = curr_batch.p1
p_lengths1 = curr_batch.p1_l

p_idxs2 = curr_batch.p2
p_lengths2 = curr_batch.p2_l

g1 = self.encode(g_idxs1, g_lengths1)
g2 = self.encode(g_idxs2, g_lengths2, fr=1)
p1 = self.encode(p_idxs1, p_lengths1, fr=1)
p2 = self.encode(p_idxs2, p_lengths2)

return g1, g2, p1, p2