From c9ed5fde5e7bb18132a088df678f23e886d17e1d Mon Sep 17 00:00:00 2001 From: bikash119 Date: Tue, 24 Sep 2024 13:58:03 +0530 Subject: [PATCH 01/27] Support embeddings generation using llama_cpp --- .gitignore | 4 +- src/distilabel/embeddings/llamacpp.py | 129 +++++++++++++++++++++++++ tests/unit/conftest.py | 11 +++ tests/unit/embeddings/test_llamacpp.py | 92 ++++++++++++++++++ 4 files changed, 235 insertions(+), 1 deletion(-) create mode 100644 src/distilabel/embeddings/llamacpp.py create mode 100644 tests/unit/embeddings/test_llamacpp.py diff --git a/.gitignore b/.gitignore index 42967a7edb..93707388c7 100644 --- a/.gitignore +++ b/.gitignore @@ -77,4 +77,6 @@ venv.bak/ # Other *.log *.swp -.DS_Store \ No newline at end of file +.DS_Store +#models +tests/model diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py new file mode 100644 index 0000000000..66d2c5444b --- /dev/null +++ b/src/distilabel/embeddings/llamacpp.py @@ -0,0 +1,129 @@ +# Copyright 2023-present, Argilla, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +from pydantic import PrivateAttr + +from distilabel.embeddings.base import Embeddings +from distilabel.llms.mixins.cuda_device_placement import CudaDevicePlacementMixin + +if TYPE_CHECKING: + from llama_cpp import Llama as _LlamaCpp + + +class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): + """`LlamaCpp` library implementation for embedding generation. + + Attributes: + model: the model Hugging Face Hub repo id or a path to a directory containing the + model weights and configuration files. + hub_repository_id: the Hugging Face Hub repository id. + _model: the `Llama` model instance. This attribute is meant to be used internally + and should not be accessed directly. It will be set in the `load` method. + + References: + - [Offline inference embeddings](https://llama-cpp-python.readthedocs.io/en/stable/#embeddings) + + Examples: + Generating sentence embeddings: + + ```python + from distilabel.embeddings import LlamaCppEmbeddings + + embeddings = LlamaCppEmbeddings(model="second-state/all-MiniLM-L6-v2-Q2_K.gguf") + + embeddings.load() + + results = embeddings.encode(inputs=["distilabel is awesome!", "and Argilla!"]) + # [ + # [-0.05447685346007347, -0.01623094454407692, ...], + # [4.4889533455716446e-05, 0.044016145169734955, ...], + # ] + ``` + """ + + model: str + hub_repository_id: Union[None, str] = None + disable_cuda_device_placement: bool = True + model_kwargs: Optional[Dict[str, Any]] = None + verbose: bool = False + _model: Union["_LlamaCpp", None] = PrivateAttr(None) + + def load(self) -> None: + """Loads the `gguf` model using either the path or the Hugging Face Hub repository id.""" + super().load() + + CudaDevicePlacementMixin.load(self) + + try: + from llama_cpp import Llama as _LlamaCpp + except ImportError as ie: + raise ImportError( + "`llama-cpp-python` package is not installed. 
Please install it using" + " `pip install llama-cpp-python`." + ) from ie + + if self.hub_repository_id is not None: + self._model = _LlamaCpp.from_pretrained( + repo_id=self.hub_repository_id, + filename=self.model, + verbose=self.verbose, + embedding=True, + ) + else: + try: + self._logger.info(f"Attempting to load model from: {self.model_name}") + self._model = _LlamaCpp( + model_path=self.model_name, + verbose=self.verbose, + embedding=True, + kwargs=self.model_kwargs, + ) + self._logger.info(f"self._model: {self._model}") + self._logger.info("Model loaded successfully") + except Exception as e: + self._logger.error(f"Failed to load model: {str(e)}") + raise + + def unload(self) -> None: + """Unloads the `gguf` model.""" + CudaDevicePlacementMixin.unload(self) + super().unload() + + @property + def model_name(self) -> str: + """Returns the name of the model.""" + return self.model + + def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]: + """Generates embeddings for the provided inputs. + + Args: + inputs: a list of texts for which an embedding has to be generated. + + Returns: + The generated embeddings. + """ + if self._model is None: + self._logger.error("Model is not initialized") + raise ValueError( + "Model is not initialized. Please check the initialization process." + ) + + try: + return self._model.create_embedding(inputs)["data"] + except Exception as e: + print(f"Error creating embedding: {str(e)}") + raise diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 1903d10e3c..eb4eabc58e 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -102,3 +102,14 @@ class DummyTaskOfflineBatchGeneration(DummyTask): @pytest.fixture def dummy_llm() -> AsyncLLM: return DummyAsyncLLM() + + +@pytest.fixture +def local_llamacpp_model_path(): + """ + Fixture that provides the local model path for LlamaCpp testing. + + Returns: + str: The path to the local LlamaCpp model file. + """ + return "./tests/model/gguf/all-MiniLM-L6-v2-Q2_K.gguf" diff --git a/tests/unit/embeddings/test_llamacpp.py b/tests/unit/embeddings/test_llamacpp.py new file mode 100644 index 0000000000..9b66230494 --- /dev/null +++ b/tests/unit/embeddings/test_llamacpp.py @@ -0,0 +1,92 @@ +# Copyright 2023-present, Argilla, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from distilabel.embeddings.llamacpp import LlamaCppEmbeddings + + +class TestLlamaCppEmbeddings: + model_name = "all-MiniLM-L6-v2-Q2_K.gguf" + repo_id = "second-state/All-MiniLM-L6-v2-Embedding-GGUF" + + def test_model_name(self) -> None: + """ + Test if the model name is correctly set. + """ + embeddings = LlamaCppEmbeddings(model=self.model_name) + assert embeddings.model_name == self.model_name + + def test_encode(self, local_llamacpp_model_path) -> None: + """ + Test if the model can generate embeddings. + + Args: + local_llamacpp_model_path (str): Fixture providing the local model path. 
+ """ + embeddings = LlamaCppEmbeddings(model=local_llamacpp_model_path) + inputs = [ + "Hello, how are you?", + "What a nice day!", + "I hear that llamas are very popular now.", + ] + embeddings.load() + results = embeddings.encode(inputs=inputs) + + for result in results: + assert len(result["embedding"]) == 384 + + def test_load_model_from_local(self, local_llamacpp_model_path): + """ + Test if the model can be loaded from a local file and generate embeddings. + + Args: + local_llamacpp_model_path (str): Fixture providing the local model path. + """ + embeddings = LlamaCppEmbeddings(model=local_llamacpp_model_path) + inputs = [ + "Hello, how are you?", + "What a nice day!", + "I hear that llamas are very popular now.", + ] + embeddings.load() + # Test if the model is loaded by generating an embedding + results = embeddings.encode(inputs=inputs) + + embeddings.load() + results = embeddings.encode(inputs=inputs) + + for result in results: + assert len(result["embedding"]) == 384 + + def test_load_model_from_repo(self): + """ + Test if the model can be loaded from a Hugging Face repository. + """ + embeddings = LlamaCppEmbeddings( + hub_repository_id=self.repo_id, model=self.model_name + ) + inputs = [ + "Hello, how are you?", + "What a nice day!", + "I hear that llamas are very popular now.", + ] + + embeddings.load() + # Test if the model is loaded by generating an embedding + results = embeddings.encode(inputs=inputs) + + embeddings.load() + results = embeddings.encode(inputs=inputs) + + for result in results: + assert len(result["embedding"]) == 384 From c3464bc782b6d6f11a0a18d3ad3eed30a266adef Mon Sep 17 00:00:00 2001 From: bikash119 Date: Tue, 24 Sep 2024 14:18:33 +0530 Subject: [PATCH 02/27] Added llama-cpp-python as optional dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 44404c683e..7d39bd46b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,6 +94,7 @@ vllm = [ "setuptools", ] sentence-transformers = ["sentence-transformers >= 3.0.0"] +llama_cpp_python = ["llama-cpp-python >= 0.2.90"] faiss-cpu = ["faiss-cpu >= 1.8.0"] faiss-gpu = ["faiss-gpu >= 1.7.2"] text-clustering = [ From 582ca407f6d2f78bbb118de1699a916953ba3311 Mon Sep 17 00:00:00 2001 From: bikash119 Date: Wed, 25 Sep 2024 19:28:30 +0530 Subject: [PATCH 03/27] - Added normalize_embeddings argument to allow user to pass if the embeddings should be normalized - Added testcases to test normalize embeddings --- src/distilabel/embeddings/llamacpp.py | 86 ++++++++++++++++++++------ tests/unit/embeddings/test_llamacpp.py | 66 +++++++++++++++++--- 2 files changed, 127 insertions(+), 25 deletions(-) diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index 66d2c5444b..69de88424f 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Union -from pydantic import PrivateAttr +from pydantic import Field, PrivateAttr from distilabel.embeddings.base import Embeddings from distilabel.llms.mixins.cuda_device_placement import CudaDevicePlacementMixin +from distilabel.mixins.runtime_parameters import RuntimeParameter if TYPE_CHECKING: from llama_cpp import Llama as _LlamaCpp @@ -27,9 +28,14 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): """`LlamaCpp` library implementation for embedding generation. Attributes: - model: the model Hugging Face Hub repo id or a path to a directory containing the - model weights and configuration files. + model: contains the path to the GGUF quantized model, compatible with the + installed version of the `llama.cpp` Python bindings. hub_repository_id: the Hugging Face Hub repository id. + verbose: whether to print verbose output. Defaults to `False`. + disable_cuda_device_placement: whether to disable CUDA device placement. Defaults to `True`. + normalize_embeddings: whether to normalize the embeddings. Defaults to `False`. + extra_kwargs: additional dictionary of keyword arguments that will be passed to the + `Llama` class of `llama_cpp` library. Defaults to `{}`. _model: the `Llama` model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the `load` method. @@ -42,7 +48,11 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): ```python from distilabel.embeddings import LlamaCppEmbeddings - embeddings = LlamaCppEmbeddings(model="second-state/all-MiniLM-L6-v2-Q2_K.gguf") + embeddings = LlamaCppEmbeddings(model="/path/to/model.gguf") + + ## Hugging Face Hub + + ## embeddings = LlamaCppEmbeddings(hub_repository_id="second-state/All-MiniLM-L6-v2-Embedding-GGUF", model="all-MiniLM-L6-v2-Q2_K.gguf") embeddings.load() @@ -54,11 +64,30 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): ``` """ - model: str - hub_repository_id: Union[None, str] = None - disable_cuda_device_placement: bool = True - model_kwargs: Optional[Dict[str, Any]] = None - verbose: bool = False + model: RuntimeParameter[str] = Field( + default=None, + description="Contains the path to the GGUF quantized model, compatible with the installed version of the `llama.cpp` Python bindings.", + ) + hub_repository_id: RuntimeParameter[Union[None, str]] = Field( + default=None, + description="The Hugging Face Hub repository id.", + ) + disable_cuda_device_placement: RuntimeParameter[bool] = Field( + default=True, + description="Whether to disable CUDA device placement.", + ) + verbose: RuntimeParameter[bool] = Field( + default=False, + description="Whether to print verbose output from llama.cpp library.", + ) + extra_kwargs: RuntimeParameter[Dict[str, Any]] = Field( + default={}, + description="Additional dictionary of keyword arguments that will be passed to the `Llama` class of `llama_cpp` library.", + ) + normalize_embeddings: RuntimeParameter[bool] = Field( + default=False, + description="Whether to normalize the embeddings.", + ) _model: Union["_LlamaCpp", None] = PrivateAttr(None) def load(self) -> None: @@ -76,12 +105,32 @@ def load(self) -> None: ) from ie if self.hub_repository_id is not None: - self._model = _LlamaCpp.from_pretrained( - repo_id=self.hub_repository_id, - filename=self.model, - verbose=self.verbose, - embedding=True, - ) + try: + from huggingface_hub.utils import validate_repo_id + + 
validate_repo_id(self.hub_repository_id) + except ImportError as ie: + raise ImportError( + "Llama.from_pretrained requires the huggingface-hub package. " + "You can install it with `pip install huggingface-hub`." + ) from ie + try: + self._logger.info( + f"Attempting to load model from Hugging Face Hub: {self.hub_repository_id}" + ) + self._model = _LlamaCpp.from_pretrained( + repo_id=self.hub_repository_id, + filename=self.model, + verbose=self.verbose, + embedding=True, + kwargs=self.extra_kwargs, + ) + self._logger.info("Model loaded successfully from Hugging Face Hub") + except Exception as e: + self._logger.error( + f"Failed to load model from Hugging Face Hub: {str(e)}" + ) + raise else: try: self._logger.info(f"Attempting to load model from: {self.model_name}") @@ -89,7 +138,7 @@ def load(self) -> None: model_path=self.model_name, verbose=self.verbose, embedding=True, - kwargs=self.model_kwargs, + kwargs=self.extra_kwargs, ) self._logger.info(f"self._model: {self._model}") self._logger.info("Model loaded successfully") @@ -123,7 +172,8 @@ def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]: ) try: - return self._model.create_embedding(inputs)["data"] + embeds = self._model.embed(inputs, normalize=self.normalize_embeddings) + return embeds except Exception as e: print(f"Error creating embedding: {str(e)}") raise diff --git a/tests/unit/embeddings/test_llamacpp.py b/tests/unit/embeddings/test_llamacpp.py index 9b66230494..44782b6178 100644 --- a/tests/unit/embeddings/test_llamacpp.py +++ b/tests/unit/embeddings/test_llamacpp.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np + from distilabel.embeddings.llamacpp import LlamaCppEmbeddings @@ -43,7 +45,7 @@ def test_encode(self, local_llamacpp_model_path) -> None: results = embeddings.encode(inputs=inputs) for result in results: - assert len(result["embedding"]) == 384 + assert len(result) == 384 def test_load_model_from_local(self, local_llamacpp_model_path): """ @@ -62,18 +64,17 @@ def test_load_model_from_local(self, local_llamacpp_model_path): # Test if the model is loaded by generating an embedding results = embeddings.encode(inputs=inputs) - embeddings.load() - results = embeddings.encode(inputs=inputs) - for result in results: - assert len(result["embedding"]) == 384 + assert len(result) == 384 def test_load_model_from_repo(self): """ Test if the model can be loaded from a Hugging Face repository. """ embeddings = LlamaCppEmbeddings( - hub_repository_id=self.repo_id, model=self.model_name + hub_repository_id=self.repo_id, + model=self.model_name, + normalize_embeddings=True, ) inputs = [ "Hello, how are you?", @@ -85,8 +86,59 @@ def test_load_model_from_repo(self): # Test if the model is loaded by generating an embedding results = embeddings.encode(inputs=inputs) + for result in results: + assert len(result) == 384 + + def test_normalize_embeddings_true(self, local_llamacpp_model_path): + """ + Test if embeddings are normalized when normalize_embeddings is True. 
+ """ + embeddings = LlamaCppEmbeddings( + model=local_llamacpp_model_path, normalize_embeddings=True + ) + embeddings.load() + + inputs = [ + "Hello, how are you?", + "What a nice day!", + "I hear that llamas are very popular now.", + ] + + results = embeddings.encode(inputs=inputs) + + for result in results: + # Check if the embedding is normalized (L2 norm should be close to 1) + norm = np.linalg.norm(result) + assert np.isclose( + norm, 1.0, atol=1e-6 + ), f"Norm is {norm}, expected close to 1.0" + + def test_normalize_embeddings_false(self, local_llamacpp_model_path): + """ + Test if embeddings are not normalized when normalize_embeddings is False. + """ + embeddings = LlamaCppEmbeddings( + model=local_llamacpp_model_path, normalize_embeddings=False + ) embeddings.load() + + inputs = [ + "Hello, how are you?", + "What a nice day!", + "I hear that llamas are very popular now.", + ] + results = embeddings.encode(inputs=inputs) for result in results: - assert len(result["embedding"]) == 384 + # Check if the embedding is not normalized (L2 norm should not be close to 1) + norm = np.linalg.norm(result) + assert not np.isclose( + norm, 1.0, atol=1e-6 + ), f"Norm is {norm}, expected not close to 1.0" + + # Additional check: ensure that at least one embedding has a norm significantly different from 1 + norms = [np.linalg.norm(result) for result in results] + assert any( + not np.isclose(norm, 1.0, atol=0.1) for norm in norms + ), "Expected at least one embedding with norm not close to 1.0" From fba8adacb9edea4429ab9f9e06c2977bc3876d93 Mon Sep 17 00:00:00 2001 From: bikash119 Date: Thu, 26 Sep 2024 08:08:56 +0530 Subject: [PATCH 04/27] Update pyproject.toml Accept recommended suggestion Co-authored-by: David Berenstein --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7d39bd46b0..44404c683e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,7 +94,6 @@ vllm = [ "setuptools", ] sentence-transformers = ["sentence-transformers >= 3.0.0"] -llama_cpp_python = ["llama-cpp-python >= 0.2.90"] faiss-cpu = ["faiss-cpu >= 1.8.0"] faiss-gpu = ["faiss-gpu >= 1.7.2"] text-clustering = [ From e288b313e000ca375755a6c63f6f828d5849386d Mon Sep 17 00:00:00 2001 From: bikash119 Date: Thu, 26 Sep 2024 09:32:11 +0530 Subject: [PATCH 05/27] - Updated test to allow developer to define test model location. - Incorporated changes suggested in review comments. --- src/distilabel/embeddings/llamacpp.py | 13 +-------- src/distilabel/llms/llamacpp.py | 6 ++--- tests/unit/conftest.py | 38 ++++++++++++++++++++++++--- 3 files changed, 37 insertions(+), 20 deletions(-) diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index 69de88424f..6cdecc6eb7 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -165,15 +165,4 @@ def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]: Returns: The generated embeddings. """ - if self._model is None: - self._logger.error("Model is not initialized") - raise ValueError( - "Model is not initialized. Please check the initialization process." 
- ) - - try: - embeds = self._model.embed(inputs, normalize=self.normalize_embeddings) - return embeds - except Exception as e: - print(f"Error creating embedding: {str(e)}") - raise + return self._model.embed(inputs, normalize=self.normalize_embeddings) diff --git a/src/distilabel/llms/llamacpp.py b/src/distilabel/llms/llamacpp.py index 9d158ea525..2d52f56ac8 100644 --- a/src/distilabel/llms/llamacpp.py +++ b/src/distilabel/llms/llamacpp.py @@ -14,7 +14,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union -from pydantic import Field, FilePath, PrivateAttr, validate_call +from pydantic import Field, PrivateAttr, validate_call from distilabel.llms.base import LLM from distilabel.llms.typing import GenerateOutput @@ -110,9 +110,7 @@ class User(BaseModel): ``` """ - model_path: RuntimeParameter[FilePath] = Field( - default=None, description="The path to the GGUF quantized model.", exclude=True - ) + model_path: str n_gpu_layers: RuntimeParameter[int] = Field( default=-1, description="The number of layers that will be loaded in the GPU.", diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index eb4eabc58e..f0db9d82a5 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from typing import TYPE_CHECKING, Any, Dict, List, Union +from urllib.request import urlretrieve import pytest @@ -105,11 +107,39 @@ def dummy_llm() -> AsyncLLM: @pytest.fixture -def local_llamacpp_model_path(): +def local_llamacpp_model_path(tmp_path): """ - Fixture that provides the local model path for LlamaCpp testing. + Fixture that provides the local model path for LlamaCpp testing and handles cleanup. - Returns: + The model path can be set using the LLAMACPP_TEST_MODEL_PATH environment variable. + If not set, it downloads a small test model to a temporary directory and cleans up after the test. + + Args: + tmp_path (Path): Pytest fixture providing a temporary directory path. + + Yields: str: The path to the local LlamaCpp model file. """ - return "./tests/model/gguf/all-MiniLM-L6-v2-Q2_K.gguf" + # Check for environment variable first + env_path = os.environ.get("LLAMACPP_TEST_MODEL_PATH") + if env_path: + yield env_path + return # No cleanup needed if env var is set + + # If env var not set, use a small test model + model_name = "all-MiniLM-L6-v2-Q2_K.gguf" + model_url = f"https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/resolve/main/{model_name}" + model_path = tmp_path / model_name + + if not model_path.exists(): + print(f"Downloading test model to {model_path}...") + urlretrieve(model_url, model_path) + print("Download complete.") + + yield str(model_path) + + # Cleanup + print(f"Cleaning up downloaded model at {model_path}...") + if model_path.exists(): + os.remove(model_path) + print("Cleanup complete.") From a936a39dd0a1708528bdaf0e0f0d4e8d6bb92438 Mon Sep 17 00:00:00 2001 From: bikash119 Date: Thu, 26 Sep 2024 09:48:18 +0530 Subject: [PATCH 06/27] - Made the test session scope - use atexit to forcefully invoke cleanup --- tests/unit/conftest.py | 52 ++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index f0db9d82a5..8425801499 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import atexit import os from typing import TYPE_CHECKING, Any, Dict, List, Union from urllib.request import urlretrieve @@ -106,29 +107,49 @@ def dummy_llm() -> AsyncLLM: return DummyAsyncLLM() -@pytest.fixture -def local_llamacpp_model_path(tmp_path): +@pytest.fixture(scope="session") +def local_llamacpp_model_path(tmp_path_factory): """ - Fixture that provides the local model path for LlamaCpp testing and handles cleanup. + Session-scoped fixture that provides the local model path for LlamaCpp testing. The model path can be set using the LLAMACPP_TEST_MODEL_PATH environment variable. - If not set, it downloads a small test model to a temporary directory and cleans up after the test. + If not set, it downloads a small test model to a temporary directory. + The model is downloaded once per test session and cleaned up after all tests. + + To use a custom model: + 1. Set the LLAMACPP_TEST_MODEL_PATH environment variable to the path of your model file. + 2. Ensure the model file exists at the specified path. + + Example: + export LLAMACPP_TEST_MODEL_PATH="/path/to/your/model.gguf" Args: - tmp_path (Path): Pytest fixture providing a temporary directory path. + tmp_path_factory: Pytest fixture providing a temporary directory factory. - Yields: + Returns: str: The path to the local LlamaCpp model file. """ + print("\nLlamaCpp model path information:") + # Check for environment variable first env_path = os.environ.get("LLAMACPP_TEST_MODEL_PATH") if env_path: - yield env_path - return # No cleanup needed if env var is set + print(f"Using custom model path from LLAMACPP_TEST_MODEL_PATH: {env_path}") + if not os.path.exists(env_path): + raise FileNotFoundError( + f"Custom model file not found at {env_path}. Please ensure the file exists." + ) + return env_path + + print("LLAMACPP_TEST_MODEL_PATH not set. Using default test model.") + print( + "To use a custom model, set the LLAMACPP_TEST_MODEL_PATH environment variable to the path of your model file." 
+ ) # If env var not set, use a small test model model_name = "all-MiniLM-L6-v2-Q2_K.gguf" model_url = f"https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/resolve/main/{model_name}" + tmp_path = tmp_path_factory.getbasetemp() model_path = tmp_path / model_name if not model_path.exists(): @@ -136,10 +157,13 @@ def local_llamacpp_model_path(tmp_path): urlretrieve(model_url, model_path) print("Download complete.") - yield str(model_path) + def cleanup(): + if model_path.exists(): + print(f"Cleaning up downloaded model at {model_path}...") + os.remove(model_path) + print("Cleanup complete.") + + # Register the cleanup function to be called at exit + atexit.register(cleanup) - # Cleanup - print(f"Cleaning up downloaded model at {model_path}...") - if model_path.exists(): - os.remove(model_path) - print("Cleanup complete.") + return str(model_path) From 316afa0e6566b0049a923829aadefd56ae949868 Mon Sep 17 00:00:00 2001 From: bikash119 Date: Thu, 26 Sep 2024 10:10:59 +0530 Subject: [PATCH 07/27] - Reverted the changes made to model_path --- src/distilabel/llms/llamacpp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/distilabel/llms/llamacpp.py b/src/distilabel/llms/llamacpp.py index 2d52f56ac8..9d158ea525 100644 --- a/src/distilabel/llms/llamacpp.py +++ b/src/distilabel/llms/llamacpp.py @@ -14,7 +14,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union -from pydantic import Field, PrivateAttr, validate_call +from pydantic import Field, FilePath, PrivateAttr, validate_call from distilabel.llms.base import LLM from distilabel.llms.typing import GenerateOutput @@ -110,7 +110,9 @@ class User(BaseModel): ``` """ - model_path: str + model_path: RuntimeParameter[FilePath] = Field( + default=None, description="The path to the GGUF quantized model.", exclude=True + ) n_gpu_layers: RuntimeParameter[int] = Field( default=-1, description="The number of layers that will be loaded in the GPU.", From 71378839136941dd7596f9144d64dcdff3f6da4d Mon Sep 17 00:00:00 2001 From: bikash119 Date: Thu, 26 Sep 2024 10:57:47 +0530 Subject: [PATCH 08/27] - Implement test_encode_batch to verify various batch sizes - Add test_encode_batch_consistency to ensure consistent results - Test large batch processing capability - Verify embedding dimensions and count for different batch sizes --- src/distilabel/embeddings/llamacpp.py | 53 +++++++++++-------- tests/unit/embeddings/test_llamacpp.py | 70 +++++++++++++++++++++++--- 2 files changed, 95 insertions(+), 28 deletions(-) diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index 6cdecc6eb7..39c1806f02 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -28,12 +28,16 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): """`LlamaCpp` library implementation for embedding generation. Attributes: - model: contains the path to the GGUF quantized model, compatible with the + model_path: contains the path to the GGUF quantized model, compatible with the installed version of the `llama.cpp` Python bindings. - hub_repository_id: the Hugging Face Hub repository id. + repo_id: the Hugging Face Hub repository id. verbose: whether to print verbose output. Defaults to `False`. + n_gpu_layers: number of layers to run on the GPU. Defaults to `-1` (use the GPU if available). disable_cuda_device_placement: whether to disable CUDA device placement. Defaults to `True`. normalize_embeddings: whether to normalize the embeddings. 
Defaults to `False`. + seed: RNG seed, -1 for random + n_ctx: Text context, 0 = from model + n_batch: Prompt processing maximum batch size extra_kwargs: additional dictionary of keyword arguments that will be passed to the `Llama` class of `llama_cpp` library. Defaults to `{}`. _model: the `Llama` model instance. This attribute is meant to be used internally @@ -52,7 +56,7 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): ## Hugging Face Hub - ## embeddings = LlamaCppEmbeddings(hub_repository_id="second-state/All-MiniLM-L6-v2-Embedding-GGUF", model="all-MiniLM-L6-v2-Q2_K.gguf") + ## embeddings = LlamaCppEmbeddings(repo_id="second-state/All-MiniLM-L6-v2-Embedding-GGUF", model="all-MiniLM-L6-v2-Q2_K.gguf") embeddings.load() @@ -64,14 +68,12 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): ``` """ - model: RuntimeParameter[str] = Field( - default=None, - description="Contains the path to the GGUF quantized model, compatible with the installed version of the `llama.cpp` Python bindings.", - ) - hub_repository_id: RuntimeParameter[Union[None, str]] = Field( + model_path: str + repo_id: RuntimeParameter[Union[None, str]] = Field( default=None, description="The Hugging Face Hub repository id.", ) + n_gpu_layers: int = 0 disable_cuda_device_placement: RuntimeParameter[bool] = Field( default=True, description="Whether to disable CUDA device placement.", @@ -80,14 +82,17 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): default=False, description="Whether to print verbose output from llama.cpp library.", ) - extra_kwargs: RuntimeParameter[Dict[str, Any]] = Field( - default={}, - description="Additional dictionary of keyword arguments that will be passed to the `Llama` class of `llama_cpp` library.", - ) normalize_embeddings: RuntimeParameter[bool] = Field( default=False, description="Whether to normalize the embeddings.", ) + seed: int = 4294967295 + n_ctx: int = 512 + n_batch: int = 512 + extra_kwargs: RuntimeParameter[Dict[str, Any]] = Field( + default={}, + description="Additional dictionary of keyword arguments that will be passed to the `Llama` class of `llama_cpp` library.", + ) _model: Union["_LlamaCpp", None] = PrivateAttr(None) def load(self) -> None: @@ -104,11 +109,11 @@ def load(self) -> None: " `pip install llama-cpp-python`." ) from ie - if self.hub_repository_id is not None: + if self.repo_id is not None: try: from huggingface_hub.utils import validate_repo_id - validate_repo_id(self.hub_repository_id) + validate_repo_id(self.repo_id) except ImportError as ie: raise ImportError( "Llama.from_pretrained requires the huggingface-hub package. 
" @@ -116,11 +121,15 @@ def load(self) -> None: ) from ie try: self._logger.info( - f"Attempting to load model from Hugging Face Hub: {self.hub_repository_id}" + f"Attempting to load model from Hugging Face Hub: {self.repo_id}" ) self._model = _LlamaCpp.from_pretrained( - repo_id=self.hub_repository_id, - filename=self.model, + repo_id=self.repo_id, + filename=self.model_path, + n_gpu_layers=self.n_gpu_layers, + seed=self.seed, + n_ctx=self.n_ctx, + n_batch=self.n_batch, verbose=self.verbose, embedding=True, kwargs=self.extra_kwargs, @@ -133,9 +142,13 @@ def load(self) -> None: raise else: try: - self._logger.info(f"Attempting to load model from: {self.model_name}") + self._logger.info(f"Attempting to load model from: {self.model_path}") self._model = _LlamaCpp( - model_path=self.model_name, + model_path=self.model_path, + seed=self.seed, + n_gpu_layers=self.n_gpu_layers, + n_ctx=self.n_ctx, + n_batch=self.n_batch, verbose=self.verbose, embedding=True, kwargs=self.extra_kwargs, @@ -154,7 +167,7 @@ def unload(self) -> None: @property def model_name(self) -> str: """Returns the name of the model.""" - return self.model + return self.model_path def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]: """Generates embeddings for the provided inputs. diff --git a/tests/unit/embeddings/test_llamacpp.py b/tests/unit/embeddings/test_llamacpp.py index 44782b6178..f7cc9eb1af 100644 --- a/tests/unit/embeddings/test_llamacpp.py +++ b/tests/unit/embeddings/test_llamacpp.py @@ -25,8 +25,8 @@ def test_model_name(self) -> None: """ Test if the model name is correctly set. """ - embeddings = LlamaCppEmbeddings(model=self.model_name) - assert embeddings.model_name == self.model_name + embeddings = LlamaCppEmbeddings(model_path=self.model_name) + assert embeddings.model_path == self.model_name def test_encode(self, local_llamacpp_model_path) -> None: """ @@ -35,7 +35,7 @@ def test_encode(self, local_llamacpp_model_path) -> None: Args: local_llamacpp_model_path (str): Fixture providing the local model path. """ - embeddings = LlamaCppEmbeddings(model=local_llamacpp_model_path) + embeddings = LlamaCppEmbeddings(model_path=local_llamacpp_model_path) inputs = [ "Hello, how are you?", "What a nice day!", @@ -54,7 +54,7 @@ def test_load_model_from_local(self, local_llamacpp_model_path): Args: local_llamacpp_model_path (str): Fixture providing the local model path. """ - embeddings = LlamaCppEmbeddings(model=local_llamacpp_model_path) + embeddings = LlamaCppEmbeddings(model_path=local_llamacpp_model_path) inputs = [ "Hello, how are you?", "What a nice day!", @@ -72,8 +72,8 @@ def test_load_model_from_repo(self): Test if the model can be loaded from a Hugging Face repository. """ embeddings = LlamaCppEmbeddings( - hub_repository_id=self.repo_id, - model=self.model_name, + repo_id=self.repo_id, + model_path=self.model_name, normalize_embeddings=True, ) inputs = [ @@ -94,7 +94,7 @@ def test_normalize_embeddings_true(self, local_llamacpp_model_path): Test if embeddings are normalized when normalize_embeddings is True. """ embeddings = LlamaCppEmbeddings( - model=local_llamacpp_model_path, normalize_embeddings=True + model_path=local_llamacpp_model_path, normalize_embeddings=True ) embeddings.load() @@ -118,7 +118,7 @@ def test_normalize_embeddings_false(self, local_llamacpp_model_path): Test if embeddings are not normalized when normalize_embeddings is False. 
""" embeddings = LlamaCppEmbeddings( - model=local_llamacpp_model_path, normalize_embeddings=False + model_path=local_llamacpp_model_path, normalize_embeddings=False ) embeddings.load() @@ -142,3 +142,57 @@ def test_normalize_embeddings_false(self, local_llamacpp_model_path): assert any( not np.isclose(norm, 1.0, atol=0.1) for norm in norms ), "Expected at least one embedding with norm not close to 1.0" + + def test_encode_batch(self, local_llamacpp_model_path) -> None: + """ + Test if the model can generate embeddings for batches of inputs. + + Args: + local_llamacpp_model_path (str): Fixture providing the local model path. + """ + embeddings = LlamaCppEmbeddings(model_path=local_llamacpp_model_path) + embeddings.load() + + # Test with different batch sizes + batch_sizes = [1, 2, 5, 10] + for batch_size in batch_sizes: + inputs = [f"This is test sentence {i}" for i in range(batch_size)] + results = embeddings.encode(inputs=inputs) + + assert ( + len(results) == batch_size + ), f"Expected {batch_size} results, got {len(results)}" + for result in results: + assert ( + len(result) == 384 + ), f"Expected embedding dimension 384, got {len(result)}" + + # Test with a large batch to ensure it doesn't cause issues + large_batch = ["Large batch test" for _ in range(100)] + large_results = embeddings.encode(inputs=large_batch) + assert ( + len(large_results) == 100 + ), f"Expected 100 results for large batch, got {len(large_results)}" + + def test_encode_batch_consistency(self, local_llamacpp_model_path) -> None: + """ + Test if the model produces consistent embeddings for the same input in different batch sizes. + + Args: + local_llamacpp_model_path (str): Fixture providing the local model path. + """ + embeddings = LlamaCppEmbeddings(model_path=local_llamacpp_model_path) + embeddings.load() + + input_text = "This is a test sentence for consistency" + + # Generate embedding individually + single_result = embeddings.encode([input_text])[0] + + # Generate embedding as part of a batch + batch_result = embeddings.encode([input_text, "Another sentence"])[0] + + # Compare the embeddings + assert np.allclose( + single_result, batch_result, atol=1e-5 + ), "Embeddings are not consistent between single and batch processing" From 2d0aa76a3ad077e5422389c98188391554032127 Mon Sep 17 00:00:00 2001 From: bikash119 Date: Thu, 26 Sep 2024 13:02:39 +0530 Subject: [PATCH 09/27] - Included LlamaCppEmbeddings to __ini__.py --- src/distilabel/embeddings/__init__.py | 2 ++ tests/unit/embeddings/test_llamacpp.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/distilabel/embeddings/__init__.py b/src/distilabel/embeddings/__init__.py index 190ea70e50..1b940d0230 100644 --- a/src/distilabel/embeddings/__init__.py +++ b/src/distilabel/embeddings/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. 
from distilabel.embeddings.base import Embeddings +from distilabel.embeddings.llamacpp import LlamaCppEmbeddings from distilabel.embeddings.sentence_transformers import SentenceTransformerEmbeddings from distilabel.embeddings.vllm import vLLMEmbeddings @@ -20,4 +21,5 @@ "Embeddings", "SentenceTransformerEmbeddings", "vLLMEmbeddings", + "LlamaCppEmbeddings", ] diff --git a/tests/unit/embeddings/test_llamacpp.py b/tests/unit/embeddings/test_llamacpp.py index f7cc9eb1af..248ce88fdc 100644 --- a/tests/unit/embeddings/test_llamacpp.py +++ b/tests/unit/embeddings/test_llamacpp.py @@ -14,7 +14,7 @@ import numpy as np -from distilabel.embeddings.llamacpp import LlamaCppEmbeddings +from distilabel.embeddings import LlamaCppEmbeddings class TestLlamaCppEmbeddings: From 778532f78bd8ebb3cd7e4705b4fb3ae0d2843af3 Mon Sep 17 00:00:00 2001 From: bikash119 Date: Tue, 1 Oct 2024 03:55:17 +0530 Subject: [PATCH 10/27] - Use HF_TOKEN to download model from hub to generate embeddings. --- src/distilabel/embeddings/llamacpp.py | 56 ++++++++++++++++++++------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index 39c1806f02..fc06417b4b 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import tempfile from typing import TYPE_CHECKING, Any, Dict, List, Union from pydantic import Field, PrivateAttr @@ -31,6 +33,7 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): model_path: contains the path to the GGUF quantized model, compatible with the installed version of the `llama.cpp` Python bindings. repo_id: the Hugging Face Hub repository id. + hf_token: Hugging Face token for accessing gated models. verbose: whether to print verbose output. Defaults to `False`. n_gpu_layers: number of layers to run on the GPU. Defaults to `-1` (use the GPU if available). disable_cuda_device_placement: whether to disable CUDA device placement. Defaults to `True`. @@ -73,6 +76,10 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): default=None, description="The Hugging Face Hub repository id.", ) + hf_token: RuntimeParameter[Union[None, str]] = Field( + default=None, + description="Hugging Face token for accessing gated models.", + ) n_gpu_layers: int = 0 disable_cuda_device_placement: RuntimeParameter[bool] = Field( default=True, @@ -96,7 +103,11 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): _model: Union["_LlamaCpp", None] = PrivateAttr(None) def load(self) -> None: - """Loads the `gguf` model using either the path or the Hugging Face Hub repository id.""" + """ + Loads the `gguf` model using either the path or the Hugging Face Hub repository id. + If using Hugging Face Hub, the model will be downloaded to a local directory + specified by the DISTILABEL_MODEL_DIR environment variable or to a temporary directory. + """ super().load() CudaDevicePlacementMixin.load(self) @@ -111,34 +122,52 @@ def load(self) -> None: if self.repo_id is not None: try: + from huggingface_hub import hf_hub_download from huggingface_hub.utils import validate_repo_id - - validate_repo_id(self.repo_id) except ImportError as ie: raise ImportError( "Llama.from_pretrained requires the huggingface-hub package. " "You can install it with `pip install huggingface-hub`." 
) from ie + + validate_repo_id(self.repo_id) + + # Determine the download directory + download_dir = os.environ.get("DISTILABEL_MODEL_DIR") + if download_dir is None: + download_dir = tempfile.gettempdir() + + self._logger.info( + f"Attempting to download model from Hugging Face Hub: {self.repo_id}" + ) try: - self._logger.info( - f"Attempting to load model from Hugging Face Hub: {self.repo_id}" - ) - self._model = _LlamaCpp.from_pretrained( + model_path = hf_hub_download( repo_id=self.repo_id, filename=self.model_path, + token=self.hf_token, + local_dir=download_dir, + ) + self._logger.info(f"Model downloaded successfully to: {model_path}") + except Exception as e: + self._logger.error( + f"Failed to download model from Hugging Face Hub: {str(e)}" + ) + raise + + try: + self._model = _LlamaCpp( + model_path=model_path, n_gpu_layers=self.n_gpu_layers, seed=self.seed, n_ctx=self.n_ctx, n_batch=self.n_batch, verbose=self.verbose, embedding=True, - kwargs=self.extra_kwargs, + **self.extra_kwargs, ) - self._logger.info("Model loaded successfully from Hugging Face Hub") + self._logger.info("Model loaded successfully") except Exception as e: - self._logger.error( - f"Failed to load model from Hugging Face Hub: {str(e)}" - ) + self._logger.error(f"Failed to load model: {str(e)}") raise else: try: @@ -151,9 +180,8 @@ def load(self) -> None: n_batch=self.n_batch, verbose=self.verbose, embedding=True, - kwargs=self.extra_kwargs, + **self.extra_kwargs, ) - self._logger.info(f"self._model: {self._model}") self._logger.info("Model loaded successfully") except Exception as e: self._logger.error(f"Failed to load model: {str(e)}") From 55c3a0d8f34b8c7e206e0a20e3f09ccc5746a2a7 Mon Sep 17 00:00:00 2001 From: bikash119 Date: Wed, 2 Oct 2024 16:43:21 +0530 Subject: [PATCH 11/27] - Download from hub is now available through mixin --- src/distilabel/embeddings/llamacpp.py | 102 ++++++------------------ src/distilabel/mixins/hub_downloader.py | 89 +++++++++++++++++++++ tests/unit/embeddings/test_llamacpp.py | 18 ++--- 3 files changed, 121 insertions(+), 88 deletions(-) create mode 100644 src/distilabel/mixins/hub_downloader.py diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index fc06417b4b..22f12bd024 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -12,21 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import tempfile from typing import TYPE_CHECKING, Any, Dict, List, Union from pydantic import Field, PrivateAttr from distilabel.embeddings.base import Embeddings from distilabel.llms.mixins.cuda_device_placement import CudaDevicePlacementMixin +from distilabel.mixins.hub_downloader import HuggingFaceModelLoaderMixin from distilabel.mixins.runtime_parameters import RuntimeParameter if TYPE_CHECKING: from llama_cpp import Llama as _LlamaCpp -class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): +class LlamaCppEmbeddings( + Embeddings, CudaDevicePlacementMixin, HuggingFaceModelLoaderMixin +): """`LlamaCpp` library implementation for embedding generation. 
Attributes: @@ -71,16 +72,8 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): ``` """ - model_path: str - repo_id: RuntimeParameter[Union[None, str]] = Field( - default=None, - description="The Hugging Face Hub repository id.", - ) - hf_token: RuntimeParameter[Union[None, str]] = Field( - default=None, - description="Hugging Face token for accessing gated models.", - ) - n_gpu_layers: int = 0 + model_file: str + n_gpu_layers: RuntimeParameter[int] = Field(default=0, description="Numbe of gpu") disable_cuda_device_placement: RuntimeParameter[bool] = Field( default=True, description="Whether to disable CUDA device placement.", @@ -120,72 +113,23 @@ def load(self) -> None: " `pip install llama-cpp-python`." ) from ie - if self.repo_id is not None: - try: - from huggingface_hub import hf_hub_download - from huggingface_hub.utils import validate_repo_id - except ImportError as ie: - raise ImportError( - "Llama.from_pretrained requires the huggingface-hub package. " - "You can install it with `pip install huggingface-hub`." - ) from ie - - validate_repo_id(self.repo_id) - - # Determine the download directory - download_dir = os.environ.get("DISTILABEL_MODEL_DIR") - if download_dir is None: - download_dir = tempfile.gettempdir() - - self._logger.info( - f"Attempting to download model from Hugging Face Hub: {self.repo_id}" + model_path = self.download_model() + try: + self._logger.info(f"Attempting to load model from: {self.model_file}") + self._model = _LlamaCpp( + model_path=model_path, + seed=self.seed, + n_gpu_layers=self.n_gpu_layers, + n_ctx=self.n_ctx, + n_batch=self.n_batch, + verbose=self.verbose, + embedding=True, + **self.extra_kwargs, ) - try: - model_path = hf_hub_download( - repo_id=self.repo_id, - filename=self.model_path, - token=self.hf_token, - local_dir=download_dir, - ) - self._logger.info(f"Model downloaded successfully to: {model_path}") - except Exception as e: - self._logger.error( - f"Failed to download model from Hugging Face Hub: {str(e)}" - ) - raise - - try: - self._model = _LlamaCpp( - model_path=model_path, - n_gpu_layers=self.n_gpu_layers, - seed=self.seed, - n_ctx=self.n_ctx, - n_batch=self.n_batch, - verbose=self.verbose, - embedding=True, - **self.extra_kwargs, - ) - self._logger.info("Model loaded successfully") - except Exception as e: - self._logger.error(f"Failed to load model: {str(e)}") - raise - else: - try: - self._logger.info(f"Attempting to load model from: {self.model_path}") - self._model = _LlamaCpp( - model_path=self.model_path, - seed=self.seed, - n_gpu_layers=self.n_gpu_layers, - n_ctx=self.n_ctx, - n_batch=self.n_batch, - verbose=self.verbose, - embedding=True, - **self.extra_kwargs, - ) - self._logger.info("Model loaded successfully") - except Exception as e: - self._logger.error(f"Failed to load model: {str(e)}") - raise + self._logger.info("Model loaded successfully") + except Exception as e: + self._logger.error(f"Failed to load model: {str(e)}") + raise def unload(self) -> None: """Unloads the `gguf` model.""" @@ -195,7 +139,7 @@ def unload(self) -> None: @property def model_name(self) -> str: """Returns the name of the model.""" - return self.model_path + return self.model_file def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]: """Generates embeddings for the provided inputs. 
diff --git a/src/distilabel/mixins/hub_downloader.py b/src/distilabel/mixins/hub_downloader.py new file mode 100644 index 0000000000..f58986fc56 --- /dev/null +++ b/src/distilabel/mixins/hub_downloader.py @@ -0,0 +1,89 @@ +# Copyright 2023-present, Argilla, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile +from typing import Optional + +from pydantic import BaseModel, Field + + +class HuggingFaceModelLoaderMixin(BaseModel): + """ + A mixin for downloading models from the Hugging Face Hub. + + Attributes: + repo_id (Optional[str]): The Hugging Face Hub repository id. + model_file (str): The name of the model file to download. + hf_token (Optional[str]): Hugging Face token for accessing gated models. + """ + + repo_id: Optional[str] = Field( + default=None, + description="The Hugging Face Hub repository id.", + ) + model_file: str = Field( + description="The name of the model file to download.", + ) + hf_token: Optional[str] = Field( + default=None, + description="Hugging Face token for accessing gated models.", + ) + + def download_model(self) -> str: + """ + Downloads the model from Hugging Face Hub if repo_id is provided. + + Returns: + str: The path to the downloaded or local model file. + + Raises: + ImportError: If huggingface_hub is not installed. + ValueError: If repo_id is not provided or invalid. + Exception: If there's an error downloading or loading the model. + """ + if self.repo_id is None: + return self.model_file + + try: + from huggingface_hub import hf_hub_download + from huggingface_hub.utils import validate_repo_id + except ImportError as ie: + raise ImportError( + "huggingface_hub package is not installed. " + "You can install it with `pip install huggingface_hub`." + ) from ie + + try: + validate_repo_id(self.repo_id) + except ValueError as ve: + raise ValueError(f"Invalid repo_id: {self.repo_id}") from ve + + # Determine the download directory + download_dir = os.environ.get("DISTILABEL_MODEL_DIR") + if download_dir is None: + download_dir = tempfile.gettempdir() + + try: + model_path = hf_hub_download( + repo_id=self.repo_id, + filename=self.model_file, + token=self.hf_token, + local_dir=download_dir, + ) + return model_path + except Exception as e: + raise Exception( + f"Failed to download model from Hugging Face Hub: {str(e)}" + ) from e diff --git a/tests/unit/embeddings/test_llamacpp.py b/tests/unit/embeddings/test_llamacpp.py index 248ce88fdc..403250a352 100644 --- a/tests/unit/embeddings/test_llamacpp.py +++ b/tests/unit/embeddings/test_llamacpp.py @@ -25,8 +25,8 @@ def test_model_name(self) -> None: """ Test if the model name is correctly set. 
""" - embeddings = LlamaCppEmbeddings(model_path=self.model_name) - assert embeddings.model_path == self.model_name + embeddings = LlamaCppEmbeddings(model_file=self.model_name) + assert embeddings.model_file == self.model_name def test_encode(self, local_llamacpp_model_path) -> None: """ @@ -35,7 +35,7 @@ def test_encode(self, local_llamacpp_model_path) -> None: Args: local_llamacpp_model_path (str): Fixture providing the local model path. """ - embeddings = LlamaCppEmbeddings(model_path=local_llamacpp_model_path) + embeddings = LlamaCppEmbeddings(model_file=local_llamacpp_model_path) inputs = [ "Hello, how are you?", "What a nice day!", @@ -54,7 +54,7 @@ def test_load_model_from_local(self, local_llamacpp_model_path): Args: local_llamacpp_model_path (str): Fixture providing the local model path. """ - embeddings = LlamaCppEmbeddings(model_path=local_llamacpp_model_path) + embeddings = LlamaCppEmbeddings(model_file=local_llamacpp_model_path) inputs = [ "Hello, how are you?", "What a nice day!", @@ -73,7 +73,7 @@ def test_load_model_from_repo(self): """ embeddings = LlamaCppEmbeddings( repo_id=self.repo_id, - model_path=self.model_name, + model_file=self.model_name, normalize_embeddings=True, ) inputs = [ @@ -94,7 +94,7 @@ def test_normalize_embeddings_true(self, local_llamacpp_model_path): Test if embeddings are normalized when normalize_embeddings is True. """ embeddings = LlamaCppEmbeddings( - model_path=local_llamacpp_model_path, normalize_embeddings=True + model_file=local_llamacpp_model_path, normalize_embeddings=True ) embeddings.load() @@ -118,7 +118,7 @@ def test_normalize_embeddings_false(self, local_llamacpp_model_path): Test if embeddings are not normalized when normalize_embeddings is False. """ embeddings = LlamaCppEmbeddings( - model_path=local_llamacpp_model_path, normalize_embeddings=False + model_file=local_llamacpp_model_path, normalize_embeddings=False ) embeddings.load() @@ -150,7 +150,7 @@ def test_encode_batch(self, local_llamacpp_model_path) -> None: Args: local_llamacpp_model_path (str): Fixture providing the local model path. """ - embeddings = LlamaCppEmbeddings(model_path=local_llamacpp_model_path) + embeddings = LlamaCppEmbeddings(model_file=local_llamacpp_model_path) embeddings.load() # Test with different batch sizes @@ -181,7 +181,7 @@ def test_encode_batch_consistency(self, local_llamacpp_model_path) -> None: Args: local_llamacpp_model_path (str): Fixture providing the local model path. """ - embeddings = LlamaCppEmbeddings(model_path=local_llamacpp_model_path) + embeddings = LlamaCppEmbeddings(model_file=local_llamacpp_model_path) embeddings.load() input_text = "This is a test sentence for consistency" From 935cdb8f484d3ce12c97ba4e28ab4cd0c797a2b2 Mon Sep 17 00:00:00 2001 From: bikash119 Date: Thu, 3 Oct 2024 15:43:28 +0530 Subject: [PATCH 12/27] Revert "- Download from hub is now available through mixin" This reverts commit 55c3a0d8f34b8c7e206e0a20e3f09ccc5746a2a7. 
--- src/distilabel/embeddings/llamacpp.py | 102 ++++++++++++++++++------ src/distilabel/mixins/hub_downloader.py | 89 --------------------- tests/unit/embeddings/test_llamacpp.py | 18 ++--- 3 files changed, 88 insertions(+), 121 deletions(-) delete mode 100644 src/distilabel/mixins/hub_downloader.py diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index 22f12bd024..fc06417b4b 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -12,22 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import tempfile from typing import TYPE_CHECKING, Any, Dict, List, Union from pydantic import Field, PrivateAttr from distilabel.embeddings.base import Embeddings from distilabel.llms.mixins.cuda_device_placement import CudaDevicePlacementMixin -from distilabel.mixins.hub_downloader import HuggingFaceModelLoaderMixin from distilabel.mixins.runtime_parameters import RuntimeParameter if TYPE_CHECKING: from llama_cpp import Llama as _LlamaCpp -class LlamaCppEmbeddings( - Embeddings, CudaDevicePlacementMixin, HuggingFaceModelLoaderMixin -): +class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): """`LlamaCpp` library implementation for embedding generation. Attributes: @@ -72,8 +71,16 @@ class LlamaCppEmbeddings( ``` """ - model_file: str - n_gpu_layers: RuntimeParameter[int] = Field(default=0, description="Numbe of gpu") + model_path: str + repo_id: RuntimeParameter[Union[None, str]] = Field( + default=None, + description="The Hugging Face Hub repository id.", + ) + hf_token: RuntimeParameter[Union[None, str]] = Field( + default=None, + description="Hugging Face token for accessing gated models.", + ) + n_gpu_layers: int = 0 disable_cuda_device_placement: RuntimeParameter[bool] = Field( default=True, description="Whether to disable CUDA device placement.", @@ -113,23 +120,72 @@ def load(self) -> None: " `pip install llama-cpp-python`." ) from ie - model_path = self.download_model() - try: - self._logger.info(f"Attempting to load model from: {self.model_file}") - self._model = _LlamaCpp( - model_path=model_path, - seed=self.seed, - n_gpu_layers=self.n_gpu_layers, - n_ctx=self.n_ctx, - n_batch=self.n_batch, - verbose=self.verbose, - embedding=True, - **self.extra_kwargs, + if self.repo_id is not None: + try: + from huggingface_hub import hf_hub_download + from huggingface_hub.utils import validate_repo_id + except ImportError as ie: + raise ImportError( + "Llama.from_pretrained requires the huggingface-hub package. " + "You can install it with `pip install huggingface-hub`." 
+ ) from ie + + validate_repo_id(self.repo_id) + + # Determine the download directory + download_dir = os.environ.get("DISTILABEL_MODEL_DIR") + if download_dir is None: + download_dir = tempfile.gettempdir() + + self._logger.info( + f"Attempting to download model from Hugging Face Hub: {self.repo_id}" ) - self._logger.info("Model loaded successfully") - except Exception as e: - self._logger.error(f"Failed to load model: {str(e)}") - raise + try: + model_path = hf_hub_download( + repo_id=self.repo_id, + filename=self.model_path, + token=self.hf_token, + local_dir=download_dir, + ) + self._logger.info(f"Model downloaded successfully to: {model_path}") + except Exception as e: + self._logger.error( + f"Failed to download model from Hugging Face Hub: {str(e)}" + ) + raise + + try: + self._model = _LlamaCpp( + model_path=model_path, + n_gpu_layers=self.n_gpu_layers, + seed=self.seed, + n_ctx=self.n_ctx, + n_batch=self.n_batch, + verbose=self.verbose, + embedding=True, + **self.extra_kwargs, + ) + self._logger.info("Model loaded successfully") + except Exception as e: + self._logger.error(f"Failed to load model: {str(e)}") + raise + else: + try: + self._logger.info(f"Attempting to load model from: {self.model_path}") + self._model = _LlamaCpp( + model_path=self.model_path, + seed=self.seed, + n_gpu_layers=self.n_gpu_layers, + n_ctx=self.n_ctx, + n_batch=self.n_batch, + verbose=self.verbose, + embedding=True, + **self.extra_kwargs, + ) + self._logger.info("Model loaded successfully") + except Exception as e: + self._logger.error(f"Failed to load model: {str(e)}") + raise def unload(self) -> None: """Unloads the `gguf` model.""" @@ -139,7 +195,7 @@ def unload(self) -> None: @property def model_name(self) -> str: """Returns the name of the model.""" - return self.model_file + return self.model_path def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]: """Generates embeddings for the provided inputs. diff --git a/src/distilabel/mixins/hub_downloader.py b/src/distilabel/mixins/hub_downloader.py deleted file mode 100644 index f58986fc56..0000000000 --- a/src/distilabel/mixins/hub_downloader.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2023-present, Argilla, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from typing import Optional - -from pydantic import BaseModel, Field - - -class HuggingFaceModelLoaderMixin(BaseModel): - """ - A mixin for downloading models from the Hugging Face Hub. - - Attributes: - repo_id (Optional[str]): The Hugging Face Hub repository id. - model_file (str): The name of the model file to download. - hf_token (Optional[str]): Hugging Face token for accessing gated models. 
- """ - - repo_id: Optional[str] = Field( - default=None, - description="The Hugging Face Hub repository id.", - ) - model_file: str = Field( - description="The name of the model file to download.", - ) - hf_token: Optional[str] = Field( - default=None, - description="Hugging Face token for accessing gated models.", - ) - - def download_model(self) -> str: - """ - Downloads the model from Hugging Face Hub if repo_id is provided. - - Returns: - str: The path to the downloaded or local model file. - - Raises: - ImportError: If huggingface_hub is not installed. - ValueError: If repo_id is not provided or invalid. - Exception: If there's an error downloading or loading the model. - """ - if self.repo_id is None: - return self.model_file - - try: - from huggingface_hub import hf_hub_download - from huggingface_hub.utils import validate_repo_id - except ImportError as ie: - raise ImportError( - "huggingface_hub package is not installed. " - "You can install it with `pip install huggingface_hub`." - ) from ie - - try: - validate_repo_id(self.repo_id) - except ValueError as ve: - raise ValueError(f"Invalid repo_id: {self.repo_id}") from ve - - # Determine the download directory - download_dir = os.environ.get("DISTILABEL_MODEL_DIR") - if download_dir is None: - download_dir = tempfile.gettempdir() - - try: - model_path = hf_hub_download( - repo_id=self.repo_id, - filename=self.model_file, - token=self.hf_token, - local_dir=download_dir, - ) - return model_path - except Exception as e: - raise Exception( - f"Failed to download model from Hugging Face Hub: {str(e)}" - ) from e diff --git a/tests/unit/embeddings/test_llamacpp.py b/tests/unit/embeddings/test_llamacpp.py index 403250a352..248ce88fdc 100644 --- a/tests/unit/embeddings/test_llamacpp.py +++ b/tests/unit/embeddings/test_llamacpp.py @@ -25,8 +25,8 @@ def test_model_name(self) -> None: """ Test if the model name is correctly set. """ - embeddings = LlamaCppEmbeddings(model_file=self.model_name) - assert embeddings.model_file == self.model_name + embeddings = LlamaCppEmbeddings(model_path=self.model_name) + assert embeddings.model_path == self.model_name def test_encode(self, local_llamacpp_model_path) -> None: """ @@ -35,7 +35,7 @@ def test_encode(self, local_llamacpp_model_path) -> None: Args: local_llamacpp_model_path (str): Fixture providing the local model path. """ - embeddings = LlamaCppEmbeddings(model_file=local_llamacpp_model_path) + embeddings = LlamaCppEmbeddings(model_path=local_llamacpp_model_path) inputs = [ "Hello, how are you?", "What a nice day!", @@ -54,7 +54,7 @@ def test_load_model_from_local(self, local_llamacpp_model_path): Args: local_llamacpp_model_path (str): Fixture providing the local model path. """ - embeddings = LlamaCppEmbeddings(model_file=local_llamacpp_model_path) + embeddings = LlamaCppEmbeddings(model_path=local_llamacpp_model_path) inputs = [ "Hello, how are you?", "What a nice day!", @@ -73,7 +73,7 @@ def test_load_model_from_repo(self): """ embeddings = LlamaCppEmbeddings( repo_id=self.repo_id, - model_file=self.model_name, + model_path=self.model_name, normalize_embeddings=True, ) inputs = [ @@ -94,7 +94,7 @@ def test_normalize_embeddings_true(self, local_llamacpp_model_path): Test if embeddings are normalized when normalize_embeddings is True. 
""" embeddings = LlamaCppEmbeddings( - model_file=local_llamacpp_model_path, normalize_embeddings=True + model_path=local_llamacpp_model_path, normalize_embeddings=True ) embeddings.load() @@ -118,7 +118,7 @@ def test_normalize_embeddings_false(self, local_llamacpp_model_path): Test if embeddings are not normalized when normalize_embeddings is False. """ embeddings = LlamaCppEmbeddings( - model_file=local_llamacpp_model_path, normalize_embeddings=False + model_path=local_llamacpp_model_path, normalize_embeddings=False ) embeddings.load() @@ -150,7 +150,7 @@ def test_encode_batch(self, local_llamacpp_model_path) -> None: Args: local_llamacpp_model_path (str): Fixture providing the local model path. """ - embeddings = LlamaCppEmbeddings(model_file=local_llamacpp_model_path) + embeddings = LlamaCppEmbeddings(model_path=local_llamacpp_model_path) embeddings.load() # Test with different batch sizes @@ -181,7 +181,7 @@ def test_encode_batch_consistency(self, local_llamacpp_model_path) -> None: Args: local_llamacpp_model_path (str): Fixture providing the local model path. """ - embeddings = LlamaCppEmbeddings(model_file=local_llamacpp_model_path) + embeddings = LlamaCppEmbeddings(model_path=local_llamacpp_model_path) embeddings.load() input_text = "This is a test sentence for consistency" From 29a8d56817193c827596ad46246fd41d928df871 Mon Sep 17 00:00:00 2001 From: bikash119 Date: Thu, 3 Oct 2024 15:50:31 +0530 Subject: [PATCH 13/27] Revert "- Use HF_TOKEN to download model from hub to generate embeddings." This reverts commit 778532f78bd8ebb3cd7e4705b4fb3ae0d2843af3. HF_TOKEN can be set as env variable to download gated model --- src/distilabel/embeddings/llamacpp.py | 55 ++++----------------------- 1 file changed, 8 insertions(+), 47 deletions(-) diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index fc06417b4b..53c9d550d8 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import tempfile from typing import TYPE_CHECKING, Any, Dict, List, Union from pydantic import Field, PrivateAttr @@ -33,7 +31,6 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): model_path: contains the path to the GGUF quantized model, compatible with the installed version of the `llama.cpp` Python bindings. repo_id: the Hugging Face Hub repository id. - hf_token: Hugging Face token for accessing gated models. verbose: whether to print verbose output. Defaults to `False`. n_gpu_layers: number of layers to run on the GPU. Defaults to `-1` (use the GPU if available). disable_cuda_device_placement: whether to disable CUDA device placement. Defaults to `True`. @@ -76,10 +73,6 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): default=None, description="The Hugging Face Hub repository id.", ) - hf_token: RuntimeParameter[Union[None, str]] = Field( - default=None, - description="Hugging Face token for accessing gated models.", - ) n_gpu_layers: int = 0 disable_cuda_device_placement: RuntimeParameter[bool] = Field( default=True, @@ -103,11 +96,7 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): _model: Union["_LlamaCpp", None] = PrivateAttr(None) def load(self) -> None: - """ - Loads the `gguf` model using either the path or the Hugging Face Hub repository id. 
- If using Hugging Face Hub, the model will be downloaded to a local directory - specified by the DISTILABEL_MODEL_DIR environment variable or to a temporary directory. - """ + """Loads the `gguf` model using either the path or the Hugging Face Hub repository id.""" super().load() CudaDevicePlacementMixin.load(self) @@ -122,56 +111,30 @@ def load(self) -> None: if self.repo_id is not None: try: - from huggingface_hub import hf_hub_download from huggingface_hub.utils import validate_repo_id + + validate_repo_id(self.repo_id) except ImportError as ie: raise ImportError( "Llama.from_pretrained requires the huggingface-hub package. " "You can install it with `pip install huggingface-hub`." ) from ie - - validate_repo_id(self.repo_id) - - # Determine the download directory - download_dir = os.environ.get("DISTILABEL_MODEL_DIR") - if download_dir is None: - download_dir = tempfile.gettempdir() - - self._logger.info( - f"Attempting to download model from Hugging Face Hub: {self.repo_id}" - ) try: - model_path = hf_hub_download( + self._model = _LlamaCpp.from_pretrained( repo_id=self.repo_id, filename=self.model_path, - token=self.hf_token, - local_dir=download_dir, - ) - self._logger.info(f"Model downloaded successfully to: {model_path}") - except Exception as e: - self._logger.error( - f"Failed to download model from Hugging Face Hub: {str(e)}" - ) - raise - - try: - self._model = _LlamaCpp( - model_path=model_path, n_gpu_layers=self.n_gpu_layers, seed=self.seed, n_ctx=self.n_ctx, n_batch=self.n_batch, verbose=self.verbose, embedding=True, - **self.extra_kwargs, + kwargs=self.extra_kwargs, ) - self._logger.info("Model loaded successfully") - except Exception as e: - self._logger.error(f"Failed to load model: {str(e)}") + except Exception: raise else: try: - self._logger.info(f"Attempting to load model from: {self.model_path}") self._model = _LlamaCpp( model_path=self.model_path, seed=self.seed, @@ -180,11 +143,9 @@ def load(self) -> None: n_batch=self.n_batch, verbose=self.verbose, embedding=True, - **self.extra_kwargs, + kwargs=self.extra_kwargs, ) - self._logger.info("Model loaded successfully") - except Exception as e: - self._logger.error(f"Failed to load model: {str(e)}") + except Exception: raise def unload(self) -> None: From b40b0d267b031a670c480739656e6a1ce071b50a Mon Sep 17 00:00:00 2001 From: bikash119 Date: Thu, 3 Oct 2024 20:40:46 +0530 Subject: [PATCH 14/27] - Removed mixin implemenation to download the model - alligned the attribute as per the review comments. --- src/distilabel/embeddings/llamacpp.py | 93 ++++++++++++++++++-------- tests/unit/embeddings/test_llamacpp.py | 11 +-- 2 files changed, 71 insertions(+), 33 deletions(-) diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index 53c9d550d8..e90ea24d45 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import TYPE_CHECKING, Any, Dict, List, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from pydantic import Field, PrivateAttr @@ -21,7 +21,7 @@ from distilabel.mixins.runtime_parameters import RuntimeParameter if TYPE_CHECKING: - from llama_cpp import Llama as _LlamaCpp + from llama_cpp import Llama class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): @@ -43,20 +43,51 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): _model: the `Llama` model instance. 
This attribute is meant to be used internally and should not be accessed directly. It will be set in the `load` method. + Runtime parameters: + - `n_gpu_layers`: the number of layers to use for the GPU. Defaults to `-1`. + - `verbose`: whether to print verbose output. Defaults to `False`. + - `normalize_embeddings`: whether to normalize the embeddings. Defaults to `False`. + - `extra_kwargs`: additional dictionary of keyword arguments that will be passed to the + `Llama` class of `llama_cpp` library. Defaults to `{}`. References: - [Offline inference embeddings](https://llama-cpp-python.readthedocs.io/en/stable/#embeddings) Examples: - Generating sentence embeddings: + Generating sentence embeddings using a local model: ```python + from pathlib import Path from distilabel.embeddings import LlamaCppEmbeddings - embeddings = LlamaCppEmbeddings(model="/path/to/model.gguf") + # You can follow along this example downloading the following model running the following + # command in the terminal, that will download the model to the `Downloads` folder: + # curl -L -o ~/Downloads/All-MiniLM-L6-v2-Embedding-GGUF https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/blob/main/all-MiniLM-L6-v2-Q2_K.gguf + + model_path = "Downloads/all-MiniLM-L6-v2-Q2_K.gguf" + embeddings = LlamaCppEmbeddings(model_path=str(Path.home() / model_path)) + + embeddings.load() + + results = embeddings.encode(inputs=["distilabel is awesome!", "and Argilla!"]) + # [ + # [-0.05447685346007347, -0.01623094454407692, ...], + # [4.4889533455716446e-05, 0.044016145169734955, ...], + # ] + ``` + + Generating sentence embeddings using a Hugging Face Hub model: - ## Hugging Face Hub + ```python + from pathlib import Path + from distilabel.embeddings import LlamaCppEmbeddings - ## embeddings = LlamaCppEmbeddings(repo_id="second-state/All-MiniLM-L6-v2-Embedding-GGUF", model="all-MiniLM-L6-v2-Q2_K.gguf") + # You can follow along this example downloading the following model running the following + # command in the terminal, that will download the model to the `Downloads` folder: + # curl -L -o ~/Downloads/All-MiniLM-L6-v2-Embedding-GGUF https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/blob/main/all-MiniLM-L6-v2-Q2_K.gguf + + repo_id = "second-state/All-MiniLM-L6-v2-Embedding-GGUF" + model_path = "all-MiniLM-L6-v2-Q5_K_M.gguf" + embeddings = LlamaCppEmbeddings(repo_id=repo_id,model_path=model_path) embeddings.load() @@ -69,40 +100,44 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): """ model_path: str - repo_id: RuntimeParameter[Union[None, str]] = Field( - default=None, - description="The Hugging Face Hub repository id.", - ) - n_gpu_layers: int = 0 - disable_cuda_device_placement: RuntimeParameter[bool] = Field( - default=True, - description="Whether to disable CUDA device placement.", + + repo_id: RuntimeParameter[str] = Field( + default=None, description="The Hugging Face Hub repository id.", exclude=True ) - verbose: RuntimeParameter[bool] = Field( - default=False, - description="Whether to print verbose output from llama.cpp library.", + + n_gpu_layers: RuntimeParameter[int] = Field( + default=0, + description="The number of layers that will be loaded in the GPU.", ) + + n_ctx: int = 512 + n_batch: int = 512 + seed: int = 4294967295 + normalize_embeddings: RuntimeParameter[bool] = Field( default=False, description="Whether to normalize the embeddings.", ) - seed: int = 4294967295 - n_ctx: int = 512 - n_batch: int = 512 - extra_kwargs: RuntimeParameter[Dict[str, Any]] = Field( - 
default={}, - description="Additional dictionary of keyword arguments that will be passed to the `Llama` class of `llama_cpp` library.", + verbose: RuntimeParameter[bool] = Field( + default=False, + description="Whether to print verbose output from llama.cpp library.", ) - _model: Union["_LlamaCpp", None] = PrivateAttr(None) + extra_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field( + default_factory=dict, + description="Additional dictionary of keyword arguments that will be passed to the" + " `Llama` class of `llama_cpp` library. See all the supported arguments at: " + "https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__", + ) + _model: Optional["Llama"] = PrivateAttr(...) def load(self) -> None: """Loads the `gguf` model using either the path or the Hugging Face Hub repository id.""" super().load() - + self.disable_cuda_device_placement = True CudaDevicePlacementMixin.load(self) try: - from llama_cpp import Llama as _LlamaCpp + from llama_cpp import Llama except ImportError as ie: raise ImportError( "`llama-cpp-python` package is not installed. Please install it using" @@ -120,7 +155,7 @@ def load(self) -> None: "You can install it with `pip install huggingface-hub`." ) from ie try: - self._model = _LlamaCpp.from_pretrained( + self._model = Llama.from_pretrained( repo_id=self.repo_id, filename=self.model_path, n_gpu_layers=self.n_gpu_layers, @@ -135,10 +170,10 @@ def load(self) -> None: raise else: try: - self._model = _LlamaCpp( + self._model = Llama( model_path=self.model_path, - seed=self.seed, n_gpu_layers=self.n_gpu_layers, + seed=self.seed, n_ctx=self.n_ctx, n_batch=self.n_batch, verbose=self.verbose, diff --git a/tests/unit/embeddings/test_llamacpp.py b/tests/unit/embeddings/test_llamacpp.py index 248ce88fdc..10885304aa 100644 --- a/tests/unit/embeddings/test_llamacpp.py +++ b/tests/unit/embeddings/test_llamacpp.py @@ -12,21 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path + import numpy as np from distilabel.embeddings import LlamaCppEmbeddings class TestLlamaCppEmbeddings: - model_name = "all-MiniLM-L6-v2-Q2_K.gguf" + model_path = "Downloads/all-MiniLM-L6-v2-Q2_K.gguf" repo_id = "second-state/All-MiniLM-L6-v2-Embedding-GGUF" + hub_model = "all-MiniLM-L6-v2-Q5_K_M.gguf" def test_model_name(self) -> None: """ Test if the model name is correctly set. 
""" - embeddings = LlamaCppEmbeddings(model_path=self.model_name) - assert embeddings.model_path == self.model_name + embeddings = LlamaCppEmbeddings(model_path=str(Path.home() / self.model_path)) + assert embeddings.model_name == str(Path.home() / self.model_path) def test_encode(self, local_llamacpp_model_path) -> None: """ @@ -73,8 +76,8 @@ def test_load_model_from_repo(self): """ embeddings = LlamaCppEmbeddings( repo_id=self.repo_id, - model_path=self.model_name, normalize_embeddings=True, + model_path=self.hub_model, ) inputs = [ "Hello, how are you?", From b08f3aed05ca2baf51cec3d1ee4c25055ba2e9a9 Mon Sep 17 00:00:00 2001 From: bikash119 Date: Fri, 4 Oct 2024 08:10:16 +0530 Subject: [PATCH 15/27] - Additional example added for private / public model --- src/distilabel/embeddings/llamacpp.py | 33 ++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index e90ea24d45..2dc640eaba 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -53,7 +53,7 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): - [Offline inference embeddings](https://llama-cpp-python.readthedocs.io/en/stable/#embeddings) Examples: - Generating sentence embeddings using a local model: + Generate sentence embeddings using a local model: ```python from pathlib import Path @@ -75,16 +75,12 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): # ] ``` - Generating sentence embeddings using a Hugging Face Hub model: + Generate sentence embeddings using a HuggingFace Hub public model: ```python from pathlib import Path from distilabel.embeddings import LlamaCppEmbeddings - # You can follow along this example downloading the following model running the following - # command in the terminal, that will download the model to the `Downloads` folder: - # curl -L -o ~/Downloads/All-MiniLM-L6-v2-Embedding-GGUF https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/blob/main/all-MiniLM-L6-v2-Q2_K.gguf - repo_id = "second-state/All-MiniLM-L6-v2-Embedding-GGUF" model_path = "all-MiniLM-L6-v2-Q5_K_M.gguf" embeddings = LlamaCppEmbeddings(repo_id=repo_id,model_path=model_path) @@ -97,6 +93,31 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): # [4.4889533455716446e-05, 0.044016145169734955, ...], # ] ``` + + Generate sentence embeddings using a HuggingFace Hub private model: + + ```python + from pathlib import Path + from distilabel.embeddings import LlamaCppEmbeddings + + # You need to set environment variable to download private model to the local machine + os.environ["HF_TOKEN"] = "hf_..." + + repo_id = "private_repo_id" + model_path = "model" + embeddings = LlamaCppEmbeddings(repo_id=repo_id,model_path=model_path) + + embeddings.load() + + results = embeddings.encode(inputs=["distilabel is awesome!", "and Argilla!"]) + # [ + # [-0.05447685346007347, -0.01623094454407692, ...], + # [4.4889533455716446e-05, 0.044016145169734955, ...], + # ] + ``` + + + """ model_path: str From a49363cb39108724803ac40d2251702f1fc6e093 Mon Sep 17 00:00:00 2001 From: bikash119 Date: Fri, 4 Oct 2024 09:50:19 +0530 Subject: [PATCH 16/27] - The tests can now be configured to use cpu or gpu based on parameter --cpu-only. 
`pytest tests/unit/embeddings/test_llamacpp.py --cpu-only` will generate embeddings using cpu `pytest tests/unit/embeddings/test_llamacpp.py` will generate embeddings using gpu --- src/distilabel/embeddings/llamacpp.py | 22 ++++++- tests/unit/conftest.py | 17 +++++ tests/unit/embeddings/test_llamacpp.py | 90 +++++++++++++------------- 3 files changed, 84 insertions(+), 45 deletions(-) diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index 2dc640eaba..4bf12ed2ed 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -115,7 +115,27 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): # [4.4889533455716446e-05, 0.044016145169734955, ...], # ] ``` + Generate sentence embeddings with cpu: + ```python + from pathlib import Path + from distilabel.embeddings import LlamaCppEmbeddings + + # You can follow along this example downloading the following model running the following + # command in the terminal, that will download the model to the `Downloads` folder: + # curl -L -o ~/Downloads/All-MiniLM-L6-v2-Embedding-GGUF https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/blob/main/all-MiniLM-L6-v2-Q2_K.gguf + + model_path = "Downloads/all-MiniLM-L6-v2-Q2_K.gguf" + embeddings = LlamaCppEmbeddings(model_path=str(Path.home() / model_path), n_gpu_layers=0) + + embeddings.load() + + results = embeddings.encode(inputs=["distilabel is awesome!", "and Argilla!"]) + # [ + # [-0.05447685346007347, -0.01623094454407692, ...], + # [4.4889533455716446e-05, 0.044016145169734955, ...], + # ] + ``` """ @@ -127,7 +147,7 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): ) n_gpu_layers: RuntimeParameter[int] = Field( - default=0, + default=-1, description="The number of layers that will be loaded in the GPU.", ) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 8425801499..3d4d86c875 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -167,3 +167,20 @@ def cleanup(): atexit.register(cleanup) return str(model_path) + + +def pytest_addoption(parser): + """ + Add a command-line option to pytest for CPU-only testing. + """ + parser.addoption( + "--cpu-only", action="store", default=False, help="Run tests on CPU only" + ) + + +@pytest.fixture +def use_cpu(request): + """ + Fixture to determine whether to use CPU based on command-line option. + """ + return request.config.getoption("--cpu-only") diff --git a/tests/unit/embeddings/test_llamacpp.py b/tests/unit/embeddings/test_llamacpp.py index 10885304aa..4b426ae3bc 100644 --- a/tests/unit/embeddings/test_llamacpp.py +++ b/tests/unit/embeddings/test_llamacpp.py @@ -12,72 +12,85 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pathlib import Path import numpy as np +import pytest from distilabel.embeddings import LlamaCppEmbeddings +""" +To test with CPU only, run the following command: +pytest tests/unit/embeddings/test_llamacpp.py --cpu-only + +""" + class TestLlamaCppEmbeddings: model_path = "Downloads/all-MiniLM-L6-v2-Q2_K.gguf" repo_id = "second-state/All-MiniLM-L6-v2-Embedding-GGUF" hub_model = "all-MiniLM-L6-v2-Q5_K_M.gguf" - def test_model_name(self) -> None: + @pytest.fixture(autouse=True) + def setup_embeddings(self, local_llamacpp_model_path, use_cpu): + """ + Fixture to set up embeddings for each test, considering CPU usage. 
+ """ + n_gpu_layers = 0 if use_cpu else -1 + self.embeddings = LlamaCppEmbeddings( + model_path=local_llamacpp_model_path, n_gpu_layers=n_gpu_layers + ) + self.embeddings.load() + + def test_model_name(self, local_llamacpp_model_path) -> None: """ Test if the model name is correctly set. """ - embeddings = LlamaCppEmbeddings(model_path=str(Path.home() / self.model_path)) - assert embeddings.model_name == str(Path.home() / self.model_path) + assert self.embeddings.model_name == local_llamacpp_model_path - def test_encode(self, local_llamacpp_model_path) -> None: + def test_encode(self) -> None: """ Test if the model can generate embeddings. - - Args: - local_llamacpp_model_path (str): Fixture providing the local model path. """ - embeddings = LlamaCppEmbeddings(model_path=local_llamacpp_model_path) inputs = [ "Hello, how are you?", "What a nice day!", "I hear that llamas are very popular now.", ] - embeddings.load() - results = embeddings.encode(inputs=inputs) + results = self.embeddings.encode(inputs=inputs) for result in results: assert len(result) == 384 - def test_load_model_from_local(self, local_llamacpp_model_path): + def test_load_model_from_local(self): """ Test if the model can be loaded from a local file and generate embeddings. Args: local_llamacpp_model_path (str): Fixture providing the local model path. """ - embeddings = LlamaCppEmbeddings(model_path=local_llamacpp_model_path) + inputs = [ "Hello, how are you?", "What a nice day!", "I hear that llamas are very popular now.", ] - embeddings.load() + # Test if the model is loaded by generating an embedding - results = embeddings.encode(inputs=inputs) + results = self.embeddings.encode(inputs=inputs) for result in results: assert len(result) == 384 - def test_load_model_from_repo(self): + def test_load_model_from_repo(self, use_cpu): """ Test if the model can be loaded from a Hugging Face repository. """ + n_gpu_layers = 0 if use_cpu else -1 embeddings = LlamaCppEmbeddings( repo_id=self.repo_id, normalize_embeddings=True, model_path=self.hub_model, + n_gpu_layers=n_gpu_layers, ) inputs = [ "Hello, how are you?", @@ -92,21 +105,23 @@ def test_load_model_from_repo(self): for result in results: assert len(result) == 384 - def test_normalize_embeddings_true(self, local_llamacpp_model_path): + def test_normalize_embeddings(self, use_cpu): """ Test if embeddings are normalized when normalize_embeddings is True. """ - embeddings = LlamaCppEmbeddings( - model_path=local_llamacpp_model_path, normalize_embeddings=True - ) - embeddings.load() - inputs = [ "Hello, how are you?", "What a nice day!", "I hear that llamas are very popular now.", ] - + n_gpu_layers = 0 if use_cpu else -1 + embeddings = LlamaCppEmbeddings( + repo_id=self.repo_id, + normalize_embeddings=True, + model_path=self.hub_model, + n_gpu_layers=n_gpu_layers, + ) + embeddings.load() results = embeddings.encode(inputs=inputs) for result in results: @@ -116,14 +131,10 @@ def test_normalize_embeddings_true(self, local_llamacpp_model_path): norm, 1.0, atol=1e-6 ), f"Norm is {norm}, expected close to 1.0" - def test_normalize_embeddings_false(self, local_llamacpp_model_path): + def test_normalize_embeddings_false(self): """ Test if embeddings are not normalized when normalize_embeddings is False. 
""" - embeddings = LlamaCppEmbeddings( - model_path=local_llamacpp_model_path, normalize_embeddings=False - ) - embeddings.load() inputs = [ "Hello, how are you?", @@ -131,7 +142,7 @@ def test_normalize_embeddings_false(self, local_llamacpp_model_path): "I hear that llamas are very popular now.", ] - results = embeddings.encode(inputs=inputs) + results = self.embeddings.encode(inputs=inputs) for result in results: # Check if the embedding is not normalized (L2 norm should not be close to 1) @@ -146,21 +157,15 @@ def test_normalize_embeddings_false(self, local_llamacpp_model_path): not np.isclose(norm, 1.0, atol=0.1) for norm in norms ), "Expected at least one embedding with norm not close to 1.0" - def test_encode_batch(self, local_llamacpp_model_path) -> None: + def test_encode_batch(self) -> None: """ Test if the model can generate embeddings for batches of inputs. - - Args: - local_llamacpp_model_path (str): Fixture providing the local model path. """ - embeddings = LlamaCppEmbeddings(model_path=local_llamacpp_model_path) - embeddings.load() - # Test with different batch sizes batch_sizes = [1, 2, 5, 10] for batch_size in batch_sizes: inputs = [f"This is test sentence {i}" for i in range(batch_size)] - results = embeddings.encode(inputs=inputs) + results = self.embeddings.encode(inputs=inputs) assert ( len(results) == batch_size @@ -172,28 +177,25 @@ def test_encode_batch(self, local_llamacpp_model_path) -> None: # Test with a large batch to ensure it doesn't cause issues large_batch = ["Large batch test" for _ in range(100)] - large_results = embeddings.encode(inputs=large_batch) + large_results = self.embeddings.encode(inputs=large_batch) assert ( len(large_results) == 100 ), f"Expected 100 results for large batch, got {len(large_results)}" - def test_encode_batch_consistency(self, local_llamacpp_model_path) -> None: + def test_encode_batch_consistency(self) -> None: """ Test if the model produces consistent embeddings for the same input in different batch sizes. Args: local_llamacpp_model_path (str): Fixture providing the local model path. """ - embeddings = LlamaCppEmbeddings(model_path=local_llamacpp_model_path) - embeddings.load() - input_text = "This is a test sentence for consistency" # Generate embedding individually - single_result = embeddings.encode([input_text])[0] + single_result = self.embeddings.encode([input_text])[0] # Generate embedding as part of a batch - batch_result = embeddings.encode([input_text, "Another sentence"])[0] + batch_result = self.embeddings.encode([input_text, "Another sentence"])[0] # Compare the embeddings assert np.allclose( From 575f48e2909edec9d642601304268d958dc96b8f Mon Sep 17 00:00:00 2001 From: bikash119 Date: Fri, 4 Oct 2024 18:31:46 +0530 Subject: [PATCH 17/27] - repo_id or model_path : one of the parameters is mandatory - model (name of the model) : the model used to generate embeddings. --- src/distilabel/embeddings/llamacpp.py | 65 ++++++++++++++------------ tests/unit/conftest.py | 6 +-- tests/unit/embeddings/test_llamacpp.py | 19 ++++---- 3 files changed, 45 insertions(+), 45 deletions(-) diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index 4bf12ed2ed..b264473590 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from pydantic import Field, PrivateAttr @@ -28,6 +29,8 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): """`LlamaCpp` library implementation for embedding generation. Attributes: + model_name: contains the name of the GGUF quantized model, compatible with the + installed version of the `llama.cpp` Python bindings. model_path: contains the path to the GGUF quantized model, compatible with the installed version of the `llama.cpp` Python bindings. repo_id: the Hugging Face Hub repository id. @@ -63,8 +66,9 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): # command in the terminal, that will download the model to the `Downloads` folder: # curl -L -o ~/Downloads/All-MiniLM-L6-v2-Embedding-GGUF https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/blob/main/all-MiniLM-L6-v2-Q2_K.gguf - model_path = "Downloads/all-MiniLM-L6-v2-Q2_K.gguf" - embeddings = LlamaCppEmbeddings(model_path=str(Path.home() / model_path)) + model_path = "Downloads/" + model_name = "all-MiniLM-L6-v2-Q2_K.gguf" + embeddings = LlamaCppEmbeddings(model_name=model_name,model_path=str(Path.home() / model_path)) embeddings.load() @@ -82,8 +86,8 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): from distilabel.embeddings import LlamaCppEmbeddings repo_id = "second-state/All-MiniLM-L6-v2-Embedding-GGUF" - model_path = "all-MiniLM-L6-v2-Q5_K_M.gguf" - embeddings = LlamaCppEmbeddings(repo_id=repo_id,model_path=model_path) + model_name = "all-MiniLM-L6-v2-Q5_K_M.gguf" + embeddings = LlamaCppEmbeddings(model_name=model_name,repo_id=repo_id) embeddings.load() @@ -140,7 +144,12 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): """ - model_path: str + model: str + + model_path: RuntimeParameter[str] = Field( + default=None, + description="The path to the GGUF quantized model, compatible with the installed version of the `llama.cpp` Python bindings.", + ) repo_id: RuntimeParameter[str] = Field( default=None, description="The Hugging Face Hub repository id.", exclude=True @@ -186,33 +195,25 @@ def load(self) -> None: ) from ie if self.repo_id is not None: - try: - from huggingface_hub.utils import validate_repo_id - - validate_repo_id(self.repo_id) - except ImportError as ie: - raise ImportError( - "Llama.from_pretrained requires the huggingface-hub package. " - "You can install it with `pip install huggingface-hub`." 
- ) from ie - try: - self._model = Llama.from_pretrained( - repo_id=self.repo_id, - filename=self.model_path, - n_gpu_layers=self.n_gpu_layers, - seed=self.seed, - n_ctx=self.n_ctx, - n_batch=self.n_batch, - verbose=self.verbose, - embedding=True, - kwargs=self.extra_kwargs, - ) - except Exception: - raise - else: + # use repo_id to download the model + from huggingface_hub.utils import validate_repo_id + + validate_repo_id(self.repo_id) + self._model = Llama.from_pretrained( + repo_id=self.repo_id, + filename=self.model, + n_gpu_layers=self.n_gpu_layers, + seed=self.seed, + n_ctx=self.n_ctx, + n_batch=self.n_batch, + verbose=self.verbose, + embedding=True, + kwargs=self.extra_kwargs, + ) + elif self.model_path is not None: try: self._model = Llama( - model_path=self.model_path, + model_path=str(Path(self.model_path) / self.model), n_gpu_layers=self.n_gpu_layers, seed=self.seed, n_ctx=self.n_ctx, @@ -223,6 +224,8 @@ def load(self) -> None: ) except Exception: raise + else: + raise ValueError("Either 'model_path' or 'repo_id' must be provided") def unload(self) -> None: """Unloads the `gguf` model.""" @@ -232,7 +235,7 @@ def unload(self) -> None: @property def model_name(self) -> str: """Returns the name of the model.""" - return self.model_path + return self.model def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]: """Generates embeddings for the provided inputs. diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 3d4d86c875..bf6db90a73 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -153,20 +153,16 @@ def local_llamacpp_model_path(tmp_path_factory): model_path = tmp_path / model_name if not model_path.exists(): - print(f"Downloading test model to {model_path}...") urlretrieve(model_url, model_path) - print("Download complete.") def cleanup(): if model_path.exists(): - print(f"Cleaning up downloaded model at {model_path}...") os.remove(model_path) - print("Cleanup complete.") # Register the cleanup function to be called at exit atexit.register(cleanup) - return str(model_path) + return str(tmp_path) def pytest_addoption(parser): diff --git a/tests/unit/embeddings/test_llamacpp.py b/tests/unit/embeddings/test_llamacpp.py index 4b426ae3bc..528825933b 100644 --- a/tests/unit/embeddings/test_llamacpp.py +++ b/tests/unit/embeddings/test_llamacpp.py @@ -26,26 +26,27 @@ class TestLlamaCppEmbeddings: - model_path = "Downloads/all-MiniLM-L6-v2-Q2_K.gguf" - repo_id = "second-state/All-MiniLM-L6-v2-Embedding-GGUF" - hub_model = "all-MiniLM-L6-v2-Q5_K_M.gguf" - @pytest.fixture(autouse=True) def setup_embeddings(self, local_llamacpp_model_path, use_cpu): """ Fixture to set up embeddings for each test, considering CPU usage. """ + self.model_name = "all-MiniLM-L6-v2-Q2_K.gguf" + self.repo_id = "second-state/All-MiniLM-L6-v2-Embedding-GGUF" n_gpu_layers = 0 if use_cpu else -1 self.embeddings = LlamaCppEmbeddings( - model_path=local_llamacpp_model_path, n_gpu_layers=n_gpu_layers + model=self.model_name, + model_path=local_llamacpp_model_path, + n_gpu_layers=n_gpu_layers, ) + self.embeddings.load() - def test_model_name(self, local_llamacpp_model_path) -> None: + def test_model_name(self) -> None: """ Test if the model name is correctly set. 
""" - assert self.embeddings.model_name == local_llamacpp_model_path + assert self.embeddings.model_name == self.model_name def test_encode(self) -> None: """ @@ -88,8 +89,8 @@ def test_load_model_from_repo(self, use_cpu): n_gpu_layers = 0 if use_cpu else -1 embeddings = LlamaCppEmbeddings( repo_id=self.repo_id, + model=self.model_name, normalize_embeddings=True, - model_path=self.hub_model, n_gpu_layers=n_gpu_layers, ) inputs = [ @@ -117,8 +118,8 @@ def test_normalize_embeddings(self, use_cpu): n_gpu_layers = 0 if use_cpu else -1 embeddings = LlamaCppEmbeddings( repo_id=self.repo_id, + model=self.model_name, normalize_embeddings=True, - model_path=self.hub_model, n_gpu_layers=n_gpu_layers, ) embeddings.load() From 48dce7b31b65723566f382e1a5bed05a9bc4c93d Mon Sep 17 00:00:00 2001 From: bikash119 Date: Fri, 4 Oct 2024 18:39:41 +0530 Subject: [PATCH 18/27] Added description to attribute : model --- src/distilabel/embeddings/llamacpp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index b264473590..b1e816c513 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -144,7 +144,9 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): """ - model: str + model: str = Field( + description="The name of the model to use for embeddings.", + ) model_path: RuntimeParameter[str] = Field( default=None, From 0e1fb8e39b9e9321cb6989794b3cc72eace6809e Mon Sep 17 00:00:00 2001 From: bikash119 Date: Fri, 4 Oct 2024 18:46:19 +0530 Subject: [PATCH 19/27] - Fixed examples --- src/distilabel/embeddings/llamacpp.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index b1e816c513..e7f5bf3a82 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -67,8 +67,8 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): # curl -L -o ~/Downloads/All-MiniLM-L6-v2-Embedding-GGUF https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/blob/main/all-MiniLM-L6-v2-Q2_K.gguf model_path = "Downloads/" - model_name = "all-MiniLM-L6-v2-Q2_K.gguf" - embeddings = LlamaCppEmbeddings(model_name=model_name,model_path=str(Path.home() / model_path)) + model = "all-MiniLM-L6-v2-Q2_K.gguf" + embeddings = LlamaCppEmbeddings(model=model,model_path=str(Path.home() / model_path)) embeddings.load() @@ -86,8 +86,8 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): from distilabel.embeddings import LlamaCppEmbeddings repo_id = "second-state/All-MiniLM-L6-v2-Embedding-GGUF" - model_name = "all-MiniLM-L6-v2-Q5_K_M.gguf" - embeddings = LlamaCppEmbeddings(model_name=model_name,repo_id=repo_id) + model = "all-MiniLM-L6-v2-Q5_K_M.gguf" + embeddings = LlamaCppEmbeddings(model=model,repo_id=repo_id) embeddings.load() @@ -108,8 +108,8 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): os.environ["HF_TOKEN"] = "hf_..." 
repo_id = "private_repo_id" - model_path = "model" - embeddings = LlamaCppEmbeddings(repo_id=repo_id,model_path=model_path) + model = "model" + embeddings = LlamaCppEmbeddings(repo_id=repo_id,model=model) embeddings.load() @@ -129,8 +129,9 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): # command in the terminal, that will download the model to the `Downloads` folder: # curl -L -o ~/Downloads/All-MiniLM-L6-v2-Embedding-GGUF https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/blob/main/all-MiniLM-L6-v2-Q2_K.gguf - model_path = "Downloads/all-MiniLM-L6-v2-Q2_K.gguf" - embeddings = LlamaCppEmbeddings(model_path=str(Path.home() / model_path), n_gpu_layers=0) + model_path = "Downloads/" + model = "all-MiniLM-L6-v2-Q2_K.gguf" + embeddings = LlamaCppEmbeddings(model=model,model_path=str(Path.home() / model_path), n_gpu_layers=0) embeddings.load() From f72ef3094c4c469a39615e62993a43521d08ed67 Mon Sep 17 00:00:00 2001 From: bikash119 Date: Fri, 4 Oct 2024 19:02:52 +0530 Subject: [PATCH 20/27] Updated examples --- src/distilabel/embeddings/llamacpp.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index e7f5bf3a82..62f4e86038 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -82,7 +82,6 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): Generate sentence embeddings using a HuggingFace Hub public model: ```python - from pathlib import Path from distilabel.embeddings import LlamaCppEmbeddings repo_id = "second-state/All-MiniLM-L6-v2-Embedding-GGUF" @@ -101,7 +100,6 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): Generate sentence embeddings using a HuggingFace Hub private model: ```python - from pathlib import Path from distilabel.embeddings import LlamaCppEmbeddings # You need to set environment variable to download private model to the local machine @@ -109,7 +107,7 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): repo_id = "private_repo_id" model = "model" - embeddings = LlamaCppEmbeddings(repo_id=repo_id,model=model) + embeddings = LlamaCppEmbeddings(model=model,repo_id=repo_id) embeddings.load() @@ -119,6 +117,7 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): # [4.4889533455716446e-05, 0.044016145169734955, ...], # ] ``` + Generate sentence embeddings with cpu: ```python From 8218242977e506d21d90ab311652eefb10ecbb15 Mon Sep 17 00:00:00 2001 From: bikash119 Date: Mon, 14 Oct 2024 17:19:05 +0530 Subject: [PATCH 21/27] Update src/distilabel/embeddings/llamacpp.py try except block is not needed. 
Co-authored-by: David Berenstein --- src/distilabel/embeddings/llamacpp.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index 62f4e86038..b9735f465e 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -213,19 +213,16 @@ def load(self) -> None: kwargs=self.extra_kwargs, ) elif self.model_path is not None: - try: - self._model = Llama( - model_path=str(Path(self.model_path) / self.model), - n_gpu_layers=self.n_gpu_layers, - seed=self.seed, - n_ctx=self.n_ctx, - n_batch=self.n_batch, - verbose=self.verbose, - embedding=True, - kwargs=self.extra_kwargs, - ) - except Exception: - raise + self._model = Llama( + model_path=str(Path(self.model_path) / self.model), + n_gpu_layers=self.n_gpu_layers, + seed=self.seed, + n_ctx=self.n_ctx, + n_batch=self.n_batch, + verbose=self.verbose, + embedding=True, + kwargs=self.extra_kwargs, + ) else: raise ValueError("Either 'model_path' or 'repo_id' must be provided") From db004825339191b318e2ce6e0e0a919bc8438ed0 Mon Sep 17 00:00:00 2001 From: bikash119 Date: Mon, 14 Oct 2024 17:19:41 +0530 Subject: [PATCH 22/27] Update src/distilabel/embeddings/llamacpp.py Co-authored-by: David Berenstein --- src/distilabel/embeddings/llamacpp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index b9735f465e..794a1d2164 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -52,6 +52,7 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): - `normalize_embeddings`: whether to normalize the embeddings. Defaults to `False`. - `extra_kwargs`: additional dictionary of keyword arguments that will be passed to the `Llama` class of `llama_cpp` library. Defaults to `{}`. + References: - [Offline inference embeddings](https://llama-cpp-python.readthedocs.io/en/stable/#embeddings) From 0fb7f15f2c8e4d99a68cc61557e84c74aaef6d9f Mon Sep 17 00:00:00 2001 From: bikash119 Date: Mon, 14 Oct 2024 17:20:20 +0530 Subject: [PATCH 23/27] Update src/distilabel/embeddings/llamacpp.py hidden attributes shouldn't be documented. Co-authored-by: David Berenstein --- src/distilabel/embeddings/llamacpp.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index 794a1d2164..41f41e86c3 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -43,8 +43,6 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): n_batch: Prompt processing maximum batch size extra_kwargs: additional dictionary of keyword arguments that will be passed to the `Llama` class of `llama_cpp` library. Defaults to `{}`. - _model: the `Llama` model instance. This attribute is meant to be used internally - and should not be accessed directly. It will be set in the `load` method. Runtime parameters: - `n_gpu_layers`: the number of layers to use for the GPU. Defaults to `-1`. 
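For context on the runtime parameters documented in the hunk above, a minimal usage sketch of the class as it stands at this point in the series (the `Downloads` directory is illustrative, and `n_gpu_layers=0` keeps execution on CPU):

```python
from pathlib import Path

from distilabel.embeddings import LlamaCppEmbeddings

# Illustrative location of a previously downloaded GGUF file.
model_dir = str(Path.home() / "Downloads")

embeddings = LlamaCppEmbeddings(
    model="all-MiniLM-L6-v2-Q2_K.gguf",  # file name inside model_dir
    model_path=model_dir,
    n_gpu_layers=0,             # runtime parameter: no layers offloaded to GPU
    normalize_embeddings=True,  # runtime parameter: unit-norm vectors, as the tests expect
)
embeddings.load()

vectors = embeddings.encode(inputs=["distilabel is awesome!", "and Argilla!"])
print(len(vectors), len(vectors[0]))  # 2 vectors, 384 dimensions for this model

embeddings.unload()
```
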
From 155feb280f87992e5d9e13bd69aaf40b91366b6f Mon Sep 17 00:00:00 2001 From: bikash119 Date: Mon, 14 Oct 2024 18:01:54 +0530 Subject: [PATCH 24/27] Updated test to set disable_cuda_device_placement=True when testing for cpu --- .gitignore | 3 -- src/distilabel/embeddings/llamacpp.py | 1 - tests/unit/embeddings/test_llamacpp.py | 69 +++++++++++--------------- 3 files changed, 29 insertions(+), 44 deletions(-) diff --git a/.gitignore b/.gitignore index 93707388c7..d8337200af 100644 --- a/.gitignore +++ b/.gitignore @@ -77,6 +77,3 @@ venv.bak/ # Other *.log *.swp -.DS_Store -#models -tests/model diff --git a/src/distilabel/embeddings/llamacpp.py b/src/distilabel/embeddings/llamacpp.py index 41f41e86c3..11382ac91e 100644 --- a/src/distilabel/embeddings/llamacpp.py +++ b/src/distilabel/embeddings/llamacpp.py @@ -184,7 +184,6 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): def load(self) -> None: """Loads the `gguf` model using either the path or the Hugging Face Hub repository id.""" super().load() - self.disable_cuda_device_placement = True CudaDevicePlacementMixin.load(self) try: diff --git a/tests/unit/embeddings/test_llamacpp.py b/tests/unit/embeddings/test_llamacpp.py index 528825933b..8f0a97d8b1 100644 --- a/tests/unit/embeddings/test_llamacpp.py +++ b/tests/unit/embeddings/test_llamacpp.py @@ -33,36 +33,47 @@ def setup_embeddings(self, local_llamacpp_model_path, use_cpu): """ self.model_name = "all-MiniLM-L6-v2-Q2_K.gguf" self.repo_id = "second-state/All-MiniLM-L6-v2-Embedding-GGUF" + self.disable_cuda_device_placement = True n_gpu_layers = 0 if use_cpu else -1 self.embeddings = LlamaCppEmbeddings( model=self.model_name, model_path=local_llamacpp_model_path, n_gpu_layers=n_gpu_layers, + disable_cuda_device_placement=self.disable_cuda_device_placement, ) self.embeddings.load() + @pytest.fixture + def test_inputs(self): + """ + Fixture that provides a list of test input strings. + + Returns: + list: A list of strings to be used as test inputs for embeddings. + """ + return [ + "Hello, how are you?", + "What a nice day!", + "I hear that llamas are very popular now.", + ] + def test_model_name(self) -> None: """ Test if the model name is correctly set. """ assert self.embeddings.model_name == self.model_name - def test_encode(self) -> None: + def test_encode(self, test_inputs) -> None: """ Test if the model can generate embeddings. """ - inputs = [ - "Hello, how are you?", - "What a nice day!", - "I hear that llamas are very popular now.", - ] - results = self.embeddings.encode(inputs=inputs) + results = self.embeddings.encode(inputs=test_inputs) for result in results: assert len(result) == 384 - def test_load_model_from_local(self): + def test_load_model_from_local(self, test_inputs): """ Test if the model can be loaded from a local file and generate embeddings. @@ -70,19 +81,12 @@ def test_load_model_from_local(self): local_llamacpp_model_path (str): Fixture providing the local model path. """ - inputs = [ - "Hello, how are you?", - "What a nice day!", - "I hear that llamas are very popular now.", - ] - - # Test if the model is loaded by generating an embedding - results = self.embeddings.encode(inputs=inputs) + results = self.embeddings.encode(inputs=test_inputs) for result in results: assert len(result) == 384 - def test_load_model_from_repo(self, use_cpu): + def test_load_model_from_repo(self, use_cpu, test_inputs): """ Test if the model can be loaded from a Hugging Face repository. 
""" @@ -92,38 +96,29 @@ def test_load_model_from_repo(self, use_cpu): model=self.model_name, normalize_embeddings=True, n_gpu_layers=n_gpu_layers, + disable_cuda_device_placement=self.disable_cuda_device_placement, ) - inputs = [ - "Hello, how are you?", - "What a nice day!", - "I hear that llamas are very popular now.", - ] - embeddings.load() - # Test if the model is loaded by generating an embedding - results = embeddings.encode(inputs=inputs) + results = embeddings.encode(inputs=test_inputs) for result in results: assert len(result) == 384 - def test_normalize_embeddings(self, use_cpu): + def test_normalize_embeddings(self, use_cpu, test_inputs): """ Test if embeddings are normalized when normalize_embeddings is True. """ - inputs = [ - "Hello, how are you?", - "What a nice day!", - "I hear that llamas are very popular now.", - ] + n_gpu_layers = 0 if use_cpu else -1 embeddings = LlamaCppEmbeddings( repo_id=self.repo_id, model=self.model_name, normalize_embeddings=True, n_gpu_layers=n_gpu_layers, + disable_cuda_device_placement=self.disable_cuda_device_placement, ) embeddings.load() - results = embeddings.encode(inputs=inputs) + results = embeddings.encode(inputs=test_inputs) for result in results: # Check if the embedding is normalized (L2 norm should be close to 1) @@ -132,18 +127,12 @@ def test_normalize_embeddings(self, use_cpu): norm, 1.0, atol=1e-6 ), f"Norm is {norm}, expected close to 1.0" - def test_normalize_embeddings_false(self): + def test_normalize_embeddings_false(self, test_inputs): """ Test if embeddings are not normalized when normalize_embeddings is False. """ - inputs = [ - "Hello, how are you?", - "What a nice day!", - "I hear that llamas are very popular now.", - ] - - results = self.embeddings.encode(inputs=inputs) + results = self.embeddings.encode(inputs=test_inputs) for result in results: # Check if the embedding is not normalized (L2 norm should not be close to 1) From 365940093162501e2a3c041c58d22d565aa64e3e Mon Sep 17 00:00:00 2001 From: bikash119 Date: Wed, 16 Oct 2024 18:16:01 +0530 Subject: [PATCH 25/27] testcase will by default load the model to cpu --- tests/unit/conftest.py | 45 +------------------------- tests/unit/embeddings/test_llamacpp.py | 22 ++++--------- 2 files changed, 8 insertions(+), 59 deletions(-) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index bf6db90a73..7e8fe74df5 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -112,41 +112,15 @@ def local_llamacpp_model_path(tmp_path_factory): """ Session-scoped fixture that provides the local model path for LlamaCpp testing. - The model path can be set using the LLAMACPP_TEST_MODEL_PATH environment variable. - If not set, it downloads a small test model to a temporary directory. + Download a small test model to a temporary directory. The model is downloaded once per test session and cleaned up after all tests. - To use a custom model: - 1. Set the LLAMACPP_TEST_MODEL_PATH environment variable to the path of your model file. - 2. Ensure the model file exists at the specified path. - - Example: - export LLAMACPP_TEST_MODEL_PATH="/path/to/your/model.gguf" - Args: tmp_path_factory: Pytest fixture providing a temporary directory factory. Returns: str: The path to the local LlamaCpp model file. 
""" - print("\nLlamaCpp model path information:") - - # Check for environment variable first - env_path = os.environ.get("LLAMACPP_TEST_MODEL_PATH") - if env_path: - print(f"Using custom model path from LLAMACPP_TEST_MODEL_PATH: {env_path}") - if not os.path.exists(env_path): - raise FileNotFoundError( - f"Custom model file not found at {env_path}. Please ensure the file exists." - ) - return env_path - - print("LLAMACPP_TEST_MODEL_PATH not set. Using default test model.") - print( - "To use a custom model, set the LLAMACPP_TEST_MODEL_PATH environment variable to the path of your model file." - ) - - # If env var not set, use a small test model model_name = "all-MiniLM-L6-v2-Q2_K.gguf" model_url = f"https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/resolve/main/{model_name}" tmp_path = tmp_path_factory.getbasetemp() @@ -163,20 +137,3 @@ def cleanup(): atexit.register(cleanup) return str(tmp_path) - - -def pytest_addoption(parser): - """ - Add a command-line option to pytest for CPU-only testing. - """ - parser.addoption( - "--cpu-only", action="store", default=False, help="Run tests on CPU only" - ) - - -@pytest.fixture -def use_cpu(request): - """ - Fixture to determine whether to use CPU based on command-line option. - """ - return request.config.getoption("--cpu-only") diff --git a/tests/unit/embeddings/test_llamacpp.py b/tests/unit/embeddings/test_llamacpp.py index 8f0a97d8b1..d7d8b55fbb 100644 --- a/tests/unit/embeddings/test_llamacpp.py +++ b/tests/unit/embeddings/test_llamacpp.py @@ -18,27 +18,21 @@ from distilabel.embeddings import LlamaCppEmbeddings -""" -To test with CPU only, run the following command: -pytest tests/unit/embeddings/test_llamacpp.py --cpu-only - -""" - class TestLlamaCppEmbeddings: @pytest.fixture(autouse=True) - def setup_embeddings(self, local_llamacpp_model_path, use_cpu): + def setup_embeddings(self, local_llamacpp_model_path): """ Fixture to set up embeddings for each test, considering CPU usage. """ self.model_name = "all-MiniLM-L6-v2-Q2_K.gguf" self.repo_id = "second-state/All-MiniLM-L6-v2-Embedding-GGUF" self.disable_cuda_device_placement = True - n_gpu_layers = 0 if use_cpu else -1 + self.n_gpu_layers = 0 self.embeddings = LlamaCppEmbeddings( model=self.model_name, model_path=local_llamacpp_model_path, - n_gpu_layers=n_gpu_layers, + n_gpu_layers=self.n_gpu_layers, disable_cuda_device_placement=self.disable_cuda_device_placement, ) @@ -86,16 +80,15 @@ def test_load_model_from_local(self, test_inputs): for result in results: assert len(result) == 384 - def test_load_model_from_repo(self, use_cpu, test_inputs): + def test_load_model_from_repo(self, test_inputs): """ Test if the model can be loaded from a Hugging Face repository. """ - n_gpu_layers = 0 if use_cpu else -1 embeddings = LlamaCppEmbeddings( repo_id=self.repo_id, model=self.model_name, normalize_embeddings=True, - n_gpu_layers=n_gpu_layers, + n_gpu_layers=self.n_gpu_layers, disable_cuda_device_placement=self.disable_cuda_device_placement, ) embeddings.load() @@ -104,17 +97,16 @@ def test_load_model_from_repo(self, use_cpu, test_inputs): for result in results: assert len(result) == 384 - def test_normalize_embeddings(self, use_cpu, test_inputs): + def test_normalize_embeddings(self, test_inputs): """ Test if embeddings are normalized when normalize_embeddings is True. 
""" - n_gpu_layers = 0 if use_cpu else -1 embeddings = LlamaCppEmbeddings( repo_id=self.repo_id, model=self.model_name, normalize_embeddings=True, - n_gpu_layers=n_gpu_layers, + n_gpu_layers=self.n_gpu_layers, disable_cuda_device_placement=self.disable_cuda_device_placement, ) embeddings.load() From da92cc955b78e5d54102b9c6874b59c8b5a5ac8a Mon Sep 17 00:00:00 2001 From: bikash119 Date: Sat, 26 Oct 2024 07:55:00 +0530 Subject: [PATCH 26/27] example code updated --- src/distilabel/models/embeddings/llamacpp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/distilabel/models/embeddings/llamacpp.py b/src/distilabel/models/embeddings/llamacpp.py index aa6d945b56..a72f989d11 100644 --- a/src/distilabel/models/embeddings/llamacpp.py +++ b/src/distilabel/models/embeddings/llamacpp.py @@ -59,7 +59,7 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): ```python from pathlib import Path - from distilabel.embeddings import LlamaCppEmbeddings + from distilabel.models.embeddings import LlamaCppEmbeddings # You can follow along this example downloading the following model running the following # command in the terminal, that will download the model to the `Downloads` folder: @@ -81,7 +81,7 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): Generate sentence embeddings using a HuggingFace Hub public model: ```python - from distilabel.embeddings import LlamaCppEmbeddings + from distilabel.models.embeddings import LlamaCppEmbeddings repo_id = "second-state/All-MiniLM-L6-v2-Embedding-GGUF" model = "all-MiniLM-L6-v2-Q5_K_M.gguf" @@ -99,7 +99,7 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): Generate sentence embeddings using a HuggingFace Hub private model: ```python - from distilabel.embeddings import LlamaCppEmbeddings + from distilabel.models.embeddings import LlamaCppEmbeddings # You need to set environment variable to download private model to the local machine os.environ["HF_TOKEN"] = "hf_..." 
@@ -121,7 +121,7 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): ```python from pathlib import Path - from distilabel.embeddings import LlamaCppEmbeddings + from distilabel.models.embeddings import LlamaCppEmbeddings # You can follow along this example downloading the following model running the following # command in the terminal, that will download the model to the `Downloads` folder: From 09dd551491d580941cf8e26483c7230da9c25834 Mon Sep 17 00:00:00 2001 From: bikash119 Date: Sat, 26 Oct 2024 09:57:42 +0530 Subject: [PATCH 27/27] examples fixed --- src/distilabel/models/embeddings/llamacpp.py | 53 ++++++++------------ 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/src/distilabel/models/embeddings/llamacpp.py b/src/distilabel/models/embeddings/llamacpp.py index a72f989d11..6596bb45ea 100644 --- a/src/distilabel/models/embeddings/llamacpp.py +++ b/src/distilabel/models/embeddings/llamacpp.py @@ -63,54 +63,37 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): # You can follow along this example downloading the following model running the following # command in the terminal, that will download the model to the `Downloads` folder: - # curl -L -o ~/Downloads/All-MiniLM-L6-v2-Embedding-GGUF https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/blob/main/all-MiniLM-L6-v2-Q2_K.gguf + # curl -L -o ~/Downloads/all-MiniLM-L6-v2-Q2_K.gguf https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/resolve/main/all-MiniLM-L6-v2-Q2_K.gguf model_path = "Downloads/" model = "all-MiniLM-L6-v2-Q2_K.gguf" - embeddings = LlamaCppEmbeddings(model=model,model_path=str(Path.home() / model_path)) + embeddings = LlamaCppEmbeddings( + model=model, + model_path=str(Path.home() / model_path), + ) embeddings.load() results = embeddings.encode(inputs=["distilabel is awesome!", "and Argilla!"]) - # [ - # [-0.05447685346007347, -0.01623094454407692, ...], - # [4.4889533455716446e-05, 0.044016145169734955, ...], - # ] + print(results) + embeddings.unload() ``` - Generate sentence embeddings using a HuggingFace Hub public model: + Generate sentence embeddings using a HuggingFace Hub model: ```python from distilabel.models.embeddings import LlamaCppEmbeddings - - repo_id = "second-state/All-MiniLM-L6-v2-Embedding-GGUF" - model = "all-MiniLM-L6-v2-Q5_K_M.gguf" - embeddings = LlamaCppEmbeddings(model=model,repo_id=repo_id) - - embeddings.load() - - results = embeddings.encode(inputs=["distilabel is awesome!", "and Argilla!"]) - # [ - # [-0.05447685346007347, -0.01623094454407692, ...], - # [4.4889533455716446e-05, 0.044016145169734955, ...], - # ] - ``` - - Generate sentence embeddings using a HuggingFace Hub private model: - - ```python - from distilabel.models.embeddings import LlamaCppEmbeddings - # You need to set environment variable to download private model to the local machine - os.environ["HF_TOKEN"] = "hf_..." 
- repo_id = "private_repo_id" - model = "model" + repo_id = "second-state/All-MiniLM-L6-v2-Embedding-GGUF" + model = "all-MiniLM-L6-v2-Q2_K.gguf" embeddings = LlamaCppEmbeddings(model=model,repo_id=repo_id) embeddings.load() results = embeddings.encode(inputs=["distilabel is awesome!", "and Argilla!"]) + print(results) + embeddings.unload() # [ # [-0.05447685346007347, -0.01623094454407692, ...], # [4.4889533455716446e-05, 0.044016145169734955, ...], @@ -125,15 +108,22 @@ class LlamaCppEmbeddings(Embeddings, CudaDevicePlacementMixin): # You can follow along this example downloading the following model running the following # command in the terminal, that will download the model to the `Downloads` folder: - # curl -L -o ~/Downloads/All-MiniLM-L6-v2-Embedding-GGUF https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/blob/main/all-MiniLM-L6-v2-Q2_K.gguf + # curl -L -o ~/Downloads/all-MiniLM-L6-v2-Q2_K.gguf https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/resolve/main/all-MiniLM-L6-v2-Q2_K.gguf model_path = "Downloads/" model = "all-MiniLM-L6-v2-Q2_K.gguf" - embeddings = LlamaCppEmbeddings(model=model,model_path=str(Path.home() / model_path), n_gpu_layers=0) + embeddings = LlamaCppEmbeddings( + model=model, + model_path=str(Path.home() / model_path), + n_gpu_layers=0, + disable_cuda_device_placement=True, + ) embeddings.load() results = embeddings.encode(inputs=["distilabel is awesome!", "and Argilla!"]) + print(results) + embeddings.unload() # [ # [-0.05447685346007347, -0.01623094454407692, ...], # [4.4889533455716446e-05, 0.044016145169734955, ...], @@ -227,6 +217,7 @@ def load(self) -> None: def unload(self) -> None: """Unloads the `gguf` model.""" CudaDevicePlacementMixin.unload(self) + self._model.close() super().unload() @property