tests and style
Vincent committed Oct 30, 2023
1 parent 995fd1c commit 5fc1fb7
Showing 7 changed files with 52 additions and 27 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -48,7 +48,7 @@ from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, sp
from embetter.multi import ClipEncoder

# Finetuning components
from embetter.finetune import ForwardFinetuner, ContrastiveFinetuner
from embetter.finetune import FeedForwardTuner, ContrastiveTuner, ContrastiveLearner, SbertLearner

# External embedding providers, typically needs an API key
from embetter.external import CohereEncoder, OpenAIEncoder
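For context, a minimal migration sketch, not part of this commit, showing how the rename above plays out in user code; it assumes nothing beyond the names visible in this diff:

```python
# Hypothetical migration sketch for the rename in this commit.

# Before this commit the finetuners were imported as:
# from embetter.finetune import ForwardFinetuner, ContrastiveFinetuner

# After this commit the same components are imported as:
from embetter.finetune import FeedForwardTuner, ContrastiveTuner

# Two learner classes are exported alongside the tuners:
from embetter.finetune import ContrastiveLearner, SbertLearner
```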
4 changes: 2 additions & 2 deletions docs/finetuners.md
@@ -138,7 +138,7 @@ The classes seem to separate much better! That's good news if you'd like to make
It deserves mentioning that the effect on the PCA-space does depend a lot on the chosen hyperparameters of the `FeedForwardTuner`.

```python
tuner = ForwardFinetuner(n_epochs=500, learning_rate=0.01, hidden_dim=10)
tuner = FeedForwardTuner(n_epochs=500, learning_rate=0.01, hidden_dim=10)
```

If we decrease the hidden dimensions for example then we end up with a space that looks like this:
@@ -164,7 +164,7 @@ y = df_test['label'].to_list()[:50]
# Let's build a pipeline!
pipe = make_pipeline(
SentenceEncoder(),
ForwardFinetuner(n_epochs=500, learning_rate=0.01, hidden_dim=10),
FeedForwardTuner(n_epochs=500, learning_rate=0.01, hidden_dim=10),
PCA()
)

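As a quick aside, here is a self-contained sketch of the pipeline shown in the hunk above; the toy texts, labels, and `n_components=2` are invented for illustration and are not taken from the docs:

```python
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

from embetter.text import SentenceEncoder
from embetter.finetune import FeedForwardTuner

# Tiny invented dataset: four sentences with a binary label.
X = ["i am so happy", "what a great day", "this is terrible", "i feel sad"]
y = [1, 1, 0, 0]

# Encode the text, finetune the embedding with a small feed-forward head,
# then project to 2D so the space can be plotted.
pipe = make_pipeline(
    SentenceEncoder(),
    FeedForwardTuner(n_epochs=500, learning_rate=0.01, hidden_dim=10),
    PCA(n_components=2),
)

X_2d = pipe.fit_transform(X, y)
print(X_2d.shape)  # expected: (4, 2)
```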
2 changes: 1 addition & 1 deletion docs/index.md
@@ -48,7 +48,7 @@ from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, sp
from embetter.multi import ClipEncoder

# Finetuning components
from embetter.finetune import ForwardFinetuner, ContrastiveFinetuner
from embetter.finetune import FeedForwardTuner, ContrastiveTuner, ContrastiveLearner, SbertLearner

# External embedding providers, typically needs an API key
from embetter.external import CohereEncoder, OpenAIEncoder
27 changes: 17 additions & 10 deletions embetter/finetune/_constrastive_learn.py
@@ -1,5 +1,5 @@
import torch
import numpy as np
import torch
import numpy as np

from torch.nn import CosineSimilarity
from torch import nn
@@ -21,13 +21,14 @@ def forward(self, input1, input2):

def embed(self, X):
return self.embed2(self.act(self.embed1(X)))



class ContrastiveLearner:
"""
A learner model that can finetune on pairs of data on top of numeric embeddings.
It's similar to the scikit-learn models that you're used to, but it accepts
two inputs `X1` and `X2` and tries to predict if they are similar.
two inputs `X1` and `X2` and tries to predict if they are similar.
Arguments:
sent_tfm: an instance of a `SentenceTransformer` that you'd like to finetune
@@ -40,7 +41,7 @@ class ContrastiveLearner:
```python
from sentence_transformers import SentenceTransformer
from embetter.finetune import ContrastiveLearner
import random
import random
sent_tfm = SentenceTransformer('all-MiniLM-L6-v2')
learner = SbertLearner(sent_tfm)
@@ -78,7 +79,13 @@ def sample_generator(examples, n_neg=3):
After a learner is done training, it can be used inside of a scikit-learn pipeline as you normally would.
"""

def __init__(self, shape_out:int = 300, batch_size:int = 16, epochs: int=1, learning_rate=2e-05):
def __init__(
self,
shape_out: int = 300,
batch_size: int = 16,
epochs: int = 1,
learning_rate=2e-05,
):
self.learning_rate = learning_rate
self.network_ = None
self.batch_size = batch_size
@@ -87,11 +94,11 @@ def __init__(self, shape_out:int = 300, batch_size:int = 16, epochs: int=1, lear

def fit(self, X1, X2, y):
"""Finetune an Sbert model based on similarities between two sets of texts."""
self.network_ = ContrastiveNetwork(shape_in=X1.shape[1], hidden_dim=self.shape_out)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(
self.network_.parameters(), lr=self.learning_rate
self.network_ = ContrastiveNetwork(
shape_in=X1.shape[1], hidden_dim=self.shape_out
)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(self.network_.parameters(), lr=self.learning_rate)

X1_torch = torch.from_numpy(X1).detach().float()
X2_torch = torch.from_numpy(X2).detach().float()
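To make the reformatted constructor concrete, a hedged usage sketch for `ContrastiveLearner`; the random embeddings and labels are invented, and it assumes the learner exposes the `fit`/`transform` surface its docstring describes:

```python
import numpy as np
from embetter.finetune import ContrastiveLearner

# Invented float32 embeddings for two sets of paired texts.
rng = np.random.default_rng(0)
X1 = rng.normal(size=(100, 384)).astype(np.float32)
X2 = rng.normal(size=(100, 384)).astype(np.float32)
# 1.0 means the pair is similar, 0.0 means it is not.
y = rng.integers(0, 2, size=100).astype(np.float32)

learner = ContrastiveLearner(shape_out=300, batch_size=16, epochs=1, learning_rate=2e-05)
learner.fit(X1, X2, y)

# After fitting, embeddings can be mapped into the learned space (assumed transform()).
X1_contrastive = learner.transform(X1)
```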
14 changes: 9 additions & 5 deletions embetter/finetune/_contrastive_tuner.py
@@ -9,6 +9,7 @@

from ._constrastive_learn import ContrastiveLearner


@dataclass
class Example:
"""Internal example class."""
@@ -71,10 +72,13 @@ class ContrastiveTuner(BaseEstimator, TransformerMixin):
learning_rate: learning rate of the contrastive network
"""

def __init__(
self, hidden_dim=50, n_neg=3, epochs=20, learning_rate=0.001
) -> None:
self.learner = ContrastiveLearner(shape_out=hidden_dim, batch_size=256, learning_rate=learning_rate, epochs=epochs)
def __init__(self, hidden_dim=50, n_neg=3, epochs=20, learning_rate=0.001) -> None:
self.learner = ContrastiveLearner(
shape_out=hidden_dim,
batch_size=256,
learning_rate=learning_rate,
epochs=epochs,
)
self.n_neg = n_neg
self.hidden_dim = hidden_dim
self.epochs = epochs
Expand Down Expand Up @@ -103,7 +107,7 @@ def partial_fit(self, X, y, classes=None):
self._classes = classes

X_torch = torch.from_numpy(X).detach().float()

X1, X2, out = self.generate_batch(X_torch, y=y)
# TODO: change this, we should just generate numpy internally not cast all over
self.learner.fit(np.array(X1), np.array(X2), np.array(out))
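For orientation, a hedged sketch of how `ContrastiveTuner` might be called, based only on the constructor and `partial_fit` visible in this hunk; the data is invented and the `transform` call is assumed from the `TransformerMixin` base:

```python
import numpy as np
from embetter.finetune import ContrastiveTuner

# Invented embeddings plus integer class labels.
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 384)).astype(np.float32)
y = rng.integers(0, 3, size=200)

tuner = ContrastiveTuner(hidden_dim=50, n_neg=3, epochs=20, learning_rate=0.001)
tuner.partial_fit(X, y, classes=[0, 1, 2])

# Assuming the usual transformer interface, map embeddings into the tuned space.
X_tuned = tuner.transform(X)
```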
2 changes: 1 addition & 1 deletion embetter/finetune/_forward.py
@@ -7,7 +7,7 @@

class FeedForwardModel(nn.Module):
"""
The internal model for the ForwardFinetuner
The internal model for the FeedForwardTuner
"""

def __init__(self, input_dim, hidden_dim, output_dim):
28 changes: 21 additions & 7 deletions embetter/finetune/_sbert_learn.py
@@ -1,15 +1,16 @@
import numpy as np
import numpy as np

from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from torch.nn import CosineSimilarity


class SbertLearner:
"""
A learner model that can finetune on pairs of data that leverages SBERT under the hood.
It's similar to the scikit-learn models that you're used to, but it accepts
two inputs `X1` and `X2` and tries to predict if they are similar.
two inputs `X1` and `X2` and tries to predict if they are similar.
Arguments:
sent_tfm: an instance of a `SentenceTransformer` that you'd like to finetune
@@ -22,7 +23,7 @@ class SbertLearner:
```python
from sentence_transformers import SentenceTransformer
from embetter.finetune import SbertLearner
import random
import random
sent_tfm = SentenceTransformer('all-MiniLM-L6-v2')
learner = SbertLearner(sent_tfm)
@@ -56,22 +57,35 @@ def sample_generator(examples, n_neg=3):
learner.transform(X1)
learner.transform(X2)
```
After a learner is done training, it can be used inside of a scikit-learn pipeline as you normally would.
"""

def __init__(self, sent_tfm: SentenceTransformer, batch_size:int = 16, epochs: int=1, warmup_steps: int=100):
def __init__(
self,
sent_tfm: SentenceTransformer,
batch_size: int = 16,
epochs: int = 1,
warmup_steps: int = 100,
):
self.sent_tfm = sent_tfm
self.batch_size = batch_size
self.epochs = epochs
self.warmup_steps = warmup_steps

def fit(self, X1, X2, y):
"""Finetune an Sbert model based on similarities between two sets of texts."""
train_examples = [InputExample(texts=[x1, x2], label=float(lab)) for x1, x2, lab in zip(X1, X2, y)]
train_examples = [
InputExample(texts=[x1, x2], label=float(lab))
for x1, x2, lab in zip(X1, X2, y)
]
data_loader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(self.sent_tfm)
self.sent_tfm.fit(train_objectives=[(data_loader, train_loss)], epochs=self.epochs, warmup_steps=self.warmup_steps)
self.sent_tfm.fit(
train_objectives=[(data_loader, train_loss)],
epochs=self.epochs,
warmup_steps=self.warmup_steps,
)
return self

def transform(self, X, y=None):
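Finally, a minimal sketch of the reformatted `SbertLearner.fit` in use, following its docstring; the text pairs and labels are invented:

```python
from sentence_transformers import SentenceTransformer
from embetter.finetune import SbertLearner

sent_tfm = SentenceTransformer("all-MiniLM-L6-v2")
learner = SbertLearner(sent_tfm, batch_size=16, epochs=1, warmup_steps=100)

# Invented text pairs with a 0/1 label saying whether each pair belongs together.
X1 = ["i love this", "great product", "i love this"]
X2 = ["this is wonderful", "really happy with it", "terrible experience"]
y = [1, 1, 0]

learner.fit(X1, X2, y)

# The finetuned transformer can now embed new text via transform().
embeddings = learner.transform(["a new sentence to embed"])
```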
