-
Notifications
You must be signed in to change notification settings - Fork 511
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Voyage AI embedding API for Anthropic.
Signed-off-by: Chong Luo <[email protected]>
- Loading branch information
Chong Luo
authored and
Chong Luo
committed
Sep 25, 2024
1 parent
7492681
commit f4c259b
Showing
6 changed files
with
240 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import numpy as np | ||
|
||
from gptcache.utils import import_voyageai | ||
from gptcache.embedding.base import BaseEmbedding | ||
|
||
import_voyageai() | ||
|
||
import voyageai | ||
|
||
|
||
class VoyageAI(BaseEmbedding):
    """Generate text embedding for given text using VoyageAI.

    :param model: The model name to use for generating embeddings. Defaults to 'voyage-3'.
    :type model: str
    :param api_key_path: The path to the VoyageAI API key file.
    :type api_key_path: str
    :param api_key: The VoyageAI API key. If it is None, the client will search for the API key in the following order:
        1. api_key_path, path to the file containing the key;
        2. environment variable VOYAGE_API_KEY_PATH, which can be set to the path to the file containing the key;
        3. environment variable VOYAGE_API_KEY.
        This behavior is defined by the VoyageAI Python SDK.
    :type api_key: str
    :param input_type: The type of input data. Defaults to None. Other options: 'query', 'document'.
        More details can be found at https://docs.voyageai.com/docs/embeddings
    :type input_type: str
    :param truncation: Whether to truncate the input data. Defaults to True.
    :type truncation: bool

    Example:
        .. code-block:: python

            from gptcache.embedding import VoyageAI

            test_sentence = 'Hello, world.'
            encoder = VoyageAI(model='voyage-3', api_key='your_voyageai_key')
            embed = encoder.to_embeddings(test_sentence)
    """

    def __init__(
        self,
        model: str = "voyage-3",
        api_key_path: str = None,
        api_key: str = None,
        input_type: str = None,
        truncation: bool = True,
    ):
        # NOTE: both settings are assigned on the ``voyageai`` module itself,
        # not on this instance, so every new VoyageAI(...) overwrites the
        # values chosen by the previous one.
        voyageai.api_key_path = api_key_path
        voyageai.api_key = api_key

        self._vo = voyageai.Client()
        self._model = model
        self._input_type = input_type
        self._truncation = truncation

        # Known models get their dimension from the static table; for any
        # other model it stays None and is resolved lazily in ``dimension``.
        self.__dimension = self.dim_dict().get(model)

    def to_embeddings(self, data, **_):
        """Generate embedding for the given text input.

        :param data: The input text, or a list of input texts.
        :type data: str or list[str]
        :return: The embedding in the shape of (dim,) for a single text
            (a plain string or a one-element list), or (n, dim) for a list
            of n > 1 texts.
        :rtype: numpy.ndarray
        """
        texts = data if isinstance(data, list) else [data]
        result = self._vo.embed(
            texts=texts,
            model=self._model,
            input_type=self._input_type,
            truncation=self._truncation,
        )
        embeddings = np.array(result.embeddings).astype("float32")
        # Only drop the leading axis when there is exactly one embedding;
        # an unconditional squeeze(0) raises ValueError for several texts.
        if embeddings.shape[0] == 1:
            return embeddings.squeeze(0)
        return embeddings

    @property
    def dimension(self):
        """Embedding dimension.

        For a model missing from :meth:`dim_dict`, the dimension is measured
        once by embedding a probe string and is then cached on the instance.

        :return: embedding dimension
        """
        if self.__dimension is None:
            foo_emb = self.to_embeddings("foo")
            self.__dimension = len(foo_emb)
        return self.__dimension

    @staticmethod
    def dim_dict():
        """Return the mapping of known model names to their embedding dimension."""
        return {
            "voyage-3": 1024,
            "voyage-3-lite": 512,
            "voyage-finance-2": 1024,
            "voyage-multilingual-2": 1024,
            "voyage-law-2": 1024,
            "voyage-code-2": 1536,
        }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
import os | ||
import types | ||
import pytest | ||
import mock | ||
from gptcache.utils import import_voyageai | ||
from gptcache.embedding import VoyageAI | ||
|
||
import_voyageai() | ||
|
||
|
||
|
||
@mock.patch.dict(os.environ, {"VOYAGE_API_KEY": "API_KEY", "VOYAGE_API_KEY_PATH": "API_KEY_FILE_PATH_ENV"})
@mock.patch("builtins.open", new_callable=mock.mock_open, read_data="API_KEY")
@mock.patch("voyageai.Client.embed", return_value=types.SimpleNamespace(embeddings=[[0] * 1024]))
def test_voageai_without_api_key(mock_created, mock_file):
    """With no explicit key, the key file named by VOYAGE_API_KEY_PATH is read."""
    expected_dim = 1024
    encoder = VoyageAI()

    assert encoder.dimension == expected_dim
    vector = encoder.to_embeddings("foo")
    assert len(vector) == expected_dim

    # Both env variables are set; the file path is the one that gets opened.
    mock_file.assert_called_once_with("API_KEY_FILE_PATH_ENV", "rt")
    expected_call = dict(texts=["foo"], model="voyage-3", input_type=None, truncation=True)
    mock_created.assert_called_once_with(**expected_call)
|
||
|
||
@mock.patch.dict(os.environ, {"VOYAGE_API_KEY": "API_KEY", "VOYAGE_API_KEY_PATH": "API_KEY_FILE_PATH_ENV"})
@mock.patch("builtins.open", new_callable=mock.mock_open, read_data="API_KEY")
@mock.patch("voyageai.Client.embed", return_value=types.SimpleNamespace(embeddings=[[0] * 1024]))
def test_voageai_with_api_key_path(mock_create, mock_file):
    """An explicit api_key_path is opened instead of the path from the environment."""
    expected_dim = 1024
    key_file = "API_KEY_FILE_PATH"
    encoder = VoyageAI(api_key_path=key_file)

    assert encoder.dimension == expected_dim
    vector = encoder.to_embeddings("foo")
    assert len(vector) == expected_dim

    mock_file.assert_called_once_with(key_file, "rt")
    mock_create.assert_called_once_with(
        texts=["foo"],
        model="voyage-3",
        input_type=None,
        truncation=True,
    )
|
||
|
||
@mock.patch.dict(os.environ, {"VOYAGE_API_KEY": "API_KEY"})
@mock.patch("builtins.open", new_callable=mock.mock_open, read_data="API_KEY")
@mock.patch("voyageai.Client.embed", return_value=types.SimpleNamespace(embeddings=[[0] * 1024]))
def test_voageai_with_api_key_in_envrion(mock_create, mock_file):
    """With only VOYAGE_API_KEY set, no key file is opened."""
    # patch.dict adds VOYAGE_API_KEY but leaves any ambient
    # VOYAGE_API_KEY_PATH in place, which would trigger a file read and break
    # assert_not_called below. Remove it here; patch.dict restores the
    # original environment when the test exits.
    os.environ.pop("VOYAGE_API_KEY_PATH", None)

    dimension = 1024
    vo = VoyageAI()

    assert vo.dimension == dimension
    assert len(vo.to_embeddings("foo")) == dimension
    mock_file.assert_not_called()
    mock_create.assert_called_once_with(texts=["foo"], model="voyage-3", input_type=None, truncation=True)
|
||
|
||
@mock.patch("voyageai.Client.embed", return_value=types.SimpleNamespace(embeddings=[[0] * 1024]))
def test_voageai_with_api_key(mock_create):
    """An explicitly passed api_key works with the default model."""
    expected_dim = 1024
    encoder = VoyageAI(api_key="API_KEY")

    assert encoder.dimension == expected_dim
    vector = encoder.to_embeddings("foo")
    assert len(vector) == expected_dim

    expected_call = dict(texts=["foo"], model="voyage-3", input_type=None, truncation=True)
    mock_create.assert_called_once_with(**expected_call)
|
||
|
||
@mock.patch.dict(os.environ)
@mock.patch("builtins.open", new_callable=mock.mock_open, read_data="API_KEY")
def test_voageai_without_api_key_or_api_key_file_path(mock_file):
    """Constructing VoyageAI with no key source at all must raise."""
    # patch.dict(os.environ) with no values only snapshots and restores the
    # environment; it does not remove variables that are already set. Drop
    # both key sources so the test does not depend on the ambient environment.
    os.environ.pop("VOYAGE_API_KEY", None)
    os.environ.pop("VOYAGE_API_KEY_PATH", None)

    with pytest.raises(Exception):
        VoyageAI()
    # Checked after the ``with`` block: a statement placed after the raising
    # call inside the block would never run.
    mock_file.assert_not_called()
|
||
|
||
@mock.patch("voyageai.Client.embed", return_value=types.SimpleNamespace(embeddings=[[0] * 512]))
def test_voageai_with_model_voyage_3_lite(mock_create):
    """The 'voyage-3-lite' model reports and returns 512-dimensional embeddings."""
    model, expected_dim = "voyage-3-lite", 512
    encoder = VoyageAI(api_key="API_KEY", model=model)

    assert encoder.dimension == expected_dim
    vector = encoder.to_embeddings("foo")
    assert len(vector) == expected_dim

    mock_create.assert_called_once_with(
        texts=["foo"],
        model=model,
        input_type=None,
        truncation=True,
    )
|
||
|
||
@mock.patch("voyageai.Client.embed", return_value=types.SimpleNamespace(embeddings=[[0] * 1024]))
def test_voageai_with_model_voyage_finance_2(mock_create):
    """The 'voyage-finance-2' model reports and returns 1024-dimensional embeddings."""
    model, expected_dim = "voyage-finance-2", 1024
    encoder = VoyageAI(api_key="API_KEY", model=model)

    assert encoder.dimension == expected_dim
    vector = encoder.to_embeddings("foo")
    assert len(vector) == expected_dim

    expected_call = dict(texts=["foo"], model=model, input_type=None, truncation=True)
    mock_create.assert_called_once_with(**expected_call)
|
||
|
||
@mock.patch("voyageai.Client.embed", return_value=types.SimpleNamespace(embeddings=[[0] * 1024]))
def test_voageai_with_model_voyage_multilingual_2(mock_create):
    """The 'voyage-multilingual-2' model reports and returns 1024-dimensional embeddings."""
    model, expected_dim = "voyage-multilingual-2", 1024
    encoder = VoyageAI(api_key="API_KEY", model=model)

    assert encoder.dimension == expected_dim
    vector = encoder.to_embeddings("foo")
    assert len(vector) == expected_dim

    mock_create.assert_called_once_with(
        texts=["foo"],
        model=model,
        input_type=None,
        truncation=True,
    )
|
||
|
||
@mock.patch("voyageai.Client.embed", return_value=types.SimpleNamespace(embeddings=[[0] * 1024]))
def test_voageai_with_model_voyage_law_2(mock_create):
    """The 'voyage-law-2' model reports and returns 1024-dimensional embeddings."""
    model, expected_dim = "voyage-law-2", 1024
    encoder = VoyageAI(api_key="API_KEY", model=model)

    assert encoder.dimension == expected_dim
    vector = encoder.to_embeddings("foo")
    assert len(vector) == expected_dim

    expected_call = dict(texts=["foo"], model=model, input_type=None, truncation=True)
    mock_create.assert_called_once_with(**expected_call)
|
||
|
||
@mock.patch("voyageai.Client.embed", return_value=types.SimpleNamespace(embeddings=[[0] * 1536]))
def test_voageai_with_model_voyage_code_2(mock_create):
    """The 'voyage-code-2' model reports and returns 1536-dimensional embeddings."""
    model, expected_dim = "voyage-code-2", 1536
    encoder = VoyageAI(api_key="API_KEY", model=model)

    assert encoder.dimension == expected_dim
    vector = encoder.to_embeddings("foo")
    assert len(vector) == expected_dim

    mock_create.assert_called_once_with(
        texts=["foo"],
        model=model,
        input_type=None,
        truncation=True,
    )
|
||
|
||
@mock.patch("voyageai.Client.embed", return_value=types.SimpleNamespace(embeddings=[[0] * 1536]))
def test_voageai_with_general_parameters(mock_create):
    """Non-default input_type and truncation are forwarded to the embed call."""
    expected_dim = 1536
    settings = dict(
        model="voyage-code-2",
        api_key="API_KEY",
        input_type="query",
        truncation=False,
    )

    mock_create.return_value = types.SimpleNamespace(embeddings=[[0] * expected_dim])

    encoder = VoyageAI(**settings)
    assert encoder.dimension == expected_dim
    # A one-element list is passed here rather than a bare string.
    vector = encoder.to_embeddings(["foo"])
    assert len(vector) == expected_dim

    mock_create.assert_called_once_with(
        texts=["foo"],
        model=settings["model"],
        input_type=settings["input_type"],
        truncation=settings["truncation"],
    )