Update docs to include info about ProcessLLM and LLMPool (#176)

* Update `LLM` concept guide
* Update `tasks` technical-reference
* Mention `Future` as output if thread pool
* Exclude docs python snippets
* Add section about using `LLMPool` with `Pipeline`
* Remove header from docs snippets
* Finish `LLMPool` guide
* Update `mkdocs-material` dep
* Remove line
* Add preference-dataset example
* Add section about `ProcessLLM` and `LLMPool`
* Remove `.git`
argilla-io · Dec 21, 2023 · 7646510 · 7646510
1 parent 00de1f1
commit 7646510
Show file tree

Hide file tree

Showing 30 changed files with 545 additions and 218 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -5,6 +5,7 @@ repos:
       - id: insert-license
         name: "Insert license header in Python source files"
         files: \.py$
+        exclude: ^docs/snippets/
         args:
           - --license-filepath
           - license_header.txt

diff --git a/docs/snippets/guides/self-instruct.py b/docs/snippets/guides/self-instruct.py
@@ -0,0 +1,14 @@
+from distilabel.llm import OpenAILLM
+from distilabel.pipeline import Pipeline
+from distilabel.tasks import SelfInstructTask
+
+self_instruct = SelfInstructTask(
+    application_description="An AI application to generate tables in markdown format.",
+    num_instructions=5,
+)
+
+generator = OpenAILLM(task=self_instruct)
+
+pipeline = Pipeline(generator=generator)
+
+dataset = pipeline.generate(dataset=dataset, num_generations=4, batch_size=2)
diff --git a/docs/snippets/technical-reference/llm/llamacpp_generate.py b/docs/snippets/technical-reference/llm/llamacpp_generate.py
@@ -11,8 +11,8 @@
     prompt_format="notus",
 )
 
-result_llamacpp = llm.generate([{"input": "What is the capital of Spain?"}])
-# >>> print(result_llamacpp[0][0]["parsed_output"]["generations"])
+result = llm.generate([{"input": "What is the capital of Spain?"}])
+# >>> print(result[0][0]["parsed_output"]["generations"])
 # The capital of Spain is Madrid. It is located in the center of the country and
 # is known for its vibrant culture, beautiful architecture, and delicious food.
 # Madrid is home to many famous landmarks such as the Prado Museum, Retiro Park,

diff --git a/docs/snippets/technical-reference/llm/llmpool.py b/docs/snippets/technical-reference/llm/llmpool.py
@@ -0,0 +1,39 @@
+from distilabel.tasks import TextGenerationTask, Task
+from distilabel.llm import ProcessLLM, LLM, LLMPool
+
+def load_gpt_3(task: Task) -> LLM:
+    from distilabel.llm import OpenAILLM
+
+    return OpenAILLM(
+        model="gpt-3.5-turbo",
+        task=task,
+        num_threads=4,
+    )
+
+def load_gpt_4(task: Task) -> LLM:
+    from distilabel.llm import OpenAILLM
+
+    return OpenAILLM(
+        model="gpt-4",
+        task=task,
+        num_threads=4,
+    )
+
+
+pool = LLMPool(llms=[
+    ProcessLLM(task=TextGenerationTask(), load_llm_fn=load_gpt_3),
+    ProcessLLM(task=TextGenerationTask(), load_llm_fn=load_gpt_4),
+])
+result = pool.generate(
+    inputs=[{"input": "Write a letter for Bob"}], num_generations=2
+)
+pool.teardown()
+# >>> print(result[0][0]["parsed_output"]["generations"])
+# Dear Bob,
+# I hope this letter finds you in good health and high spirits. I know it's been a while since we last caught up, and I wanted to take the time to connect and share a few updates.
+# Life has been keeping me pretty busy lately. [Provide a brief overview of what you've been up to: work, school, family, hobbies, etc.]
+# I've often found myself reminiscing about the good old days, like when we [include a memorable moment or shared experience with Bob].
+# >>> print(result[0][1]["parsed_output"]["generations"])
+# Of course, I'd be happy to draft a sample letter for you. However, I would need some additional 
+# information including who "Bob" is, the subject matter of the letter, the tone (formal or informal), 
+# and any specific details or points you'd like to include. Please provide some more context and I'll do my best to assist you.
diff --git a/docs/snippets/technical-reference/llm/openai_generate.py b/docs/snippets/technical-reference/llm/openai_generate.py
@@ -7,12 +7,11 @@
     model="gpt-3.5-turbo",
     task=OpenAITextGenerationTask(),
     max_new_tokens=256,
-    num_threads=2,
     openai_api_key=os.environ.get("OPENAI_API_KEY"),
     temperature=0.3,
 )
-result_openai = openaillm.generate([{"input": "What is OpenAI?"}])
-# >>> result_openai
-# [<Future at 0x2970ea560 state=running>]
-# >>> result_openai[0].result()[0][0]["parsed_output"]["generations"]
-# 'OpenAI is an artificial intelligence research organization that aims to ensure that artificial general intelligence (AGI) benefits all of humanity. AGI refers to highly autonomous systems that outperform humans at most economically valuable work. OpenAI conducts research, develops AI technologies, and promotes the responsible and safe use of AI. They also work on projects to make AI more accessible and beneficial to society. OpenAI is committed to transparency, cooperation, and avoiding uses of AI that could harm humanity or concentrate power in the wrong hands.'
+result = openaillm.generate([{"input": "What is OpenAI?"}])
+# >>> print(result[0][0]["parsed_output"]["generations"])
+# OpenAI is an artificial intelligence research laboratory and company. It was founded
+# with the goal of ensuring that artificial general intelligence (AGI) benefits all of
+# humanity. OpenAI conducts cutting-edge research in various fields of AI ...
diff --git a/docs/snippets/technical-reference/llm/processllm.py b/docs/snippets/technical-reference/llm/processllm.py
@@ -0,0 +1,25 @@
+from distilabel.tasks import TextGenerationTask, Task
+from distilabel.llm import ProcessLLM, LLM
+
+
+def load_gpt_4(task: Task) -> LLM:
+    from distilabel.llm import OpenAILLM
+
+    return OpenAILLM(
+        model="gpt-4",
+        task=task,
+        num_threads=4,
+    )
+
+
+llm = ProcessLLM(task=TextGenerationTask(), load_llm_fn=load_gpt_4)
+future = llm.generate(
+    inputs=[{"input": "Write a letter for Bob"}], num_generations=1
+)  # (1)
+llm.teardown()  # (2)
+result = future.result()
+# >>> print(result[0][0]["parsed_output"]["generations"])
+# Dear Bob,
+# I hope this letter finds you in good health and high spirits. I know it's been a while since we last caught up, and I wanted to take the time to connect and share a few updates.
+# Life has been keeping me pretty busy lately. [Provide a brief overview of what you've been up to: work, school, family, hobbies, etc.]
+# I've often found myself reminiscing about the good old days, like when we [include a memorable moment or shared experience with Bob].
diff --git a/docs/snippets/technical-reference/llm/transformers_generate.py b/docs/snippets/technical-reference/llm/transformers_generate.py
@@ -4,7 +4,7 @@
 
 # Load the models from huggingface hub:
 tokenizer = AutoTokenizer.from_pretrained("argilla/notus-7b-v1")
-model = AutoModelForCausalLM.from_pretrained("argilla/notus-7b-v1")
+model = AutoModelForCausalLM.from_pretrained("argilla/notus-7b-v1", device_map="auto")
 
 # Instantiate our LLM with them:
 llm = TransformersLLM(
@@ -15,3 +15,9 @@
     temperature=0.3,
     prompt_format="notus",
 )
+
+result = llm.generate([{"input": "What's a large language model?"}])
+# >>> print(result[0][0]["parsed_output"]["generations"])
+# A large language model is a type of machine learning algorithm that is designed to analyze
+# and understand large amounts of text data. It is called "large" because it requires a
+# vast amount of data to train and improve its accuracy. These models are ...
diff --git a/docs/snippets/technical-reference/llm/vllm_generate.py b/docs/snippets/technical-reference/llm/vllm_generate.py
@@ -0,0 +1,17 @@
+from distilabel.tasks import TextGenerationTask
+from distilabel.llm import vLLM
+from vllm import LLM
+
+llm = vLLM(
+    vllm=LLM(model="argilla/notus-7b-v1"),
+    task=TextGenerationTask(),
+    max_new_tokens=512,
+    temperature=0.3,
+    prompt_format="notus",
+)
+result_vllm = llm.generate([{"input": "What's a large language model?"}])
+# >>> print(result_vllm[0][0]["parsed_output"]["generations"])
+# A large language model is a type of artificial intelligence (AI) system that is designed
+# to understand and interpret human language. It is called "large" because it uses a vast
+# amount of data, typically billions of words or more, to learn and make predictions about
+# language. Large language models are ...
diff --git a/docs/snippets/technical-reference/pipeline/argilla.py b/docs/snippets/technical-reference/pipeline/argilla.py
@@ -1,17 +1,3 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 import argilla as rg
 
 rg.init(api_key="<YOUR_ARGILLA_API_KEY>", api_url="<YOUR_ARGILLA_API_URL>")

diff --git a/docs/snippets/technical-reference/pipeline/pipe_1.py b/docs/snippets/technical-reference/pipeline/pipe_1.py
@@ -1,17 +1,3 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 import os
 
 from distilabel.llm import InferenceEndpointsLLM

diff --git a/docs/snippets/technical-reference/pipeline/pipe_2.py b/docs/snippets/technical-reference/pipeline/pipe_2.py
@@ -1,17 +1,3 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 from datasets import load_dataset
 
 instruction_dataset = (

diff --git a/docs/snippets/technical-reference/pipeline/pipe_3.py b/docs/snippets/technical-reference/pipeline/pipe_3.py
@@ -1,17 +1,3 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 print(pipe_dataset["input"][-1])
 # Create a 3 turn conversation between a customer and a grocery store clerk - that is, 3 per person. Then tell me what they talked about.
 

diff --git a/docs/snippets/technical-reference/pipeline/pipeline_generator_2.py b/docs/snippets/technical-reference/pipeline/pipeline_generator_2.py
@@ -1,17 +1,3 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 from datasets import Dataset
 
 dataset = Dataset.from_dict(

diff --git a/docs/snippets/technical-reference/pipeline/pipeline_generator_3.py b/docs/snippets/technical-reference/pipeline/pipeline_generator_3.py
@@ -1,17 +1,3 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 print(dataset_generated)
 # Dataset({
 #     features: ['input', 'generation_model', 'generation_prompt', 'raw_generation_responses', 'generations'],

diff --git a/docs/snippets/technical-reference/pipeline/pipeline_labeller_2.py b/docs/snippets/technical-reference/pipeline/pipeline_labeller_2.py
@@ -1,17 +1,3 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 from datasets import Dataset
 
 dataset_test = Dataset.from_dict(

diff --git a/docs/snippets/technical-reference/pipeline/pipeline_labeller_3.py b/docs/snippets/technical-reference/pipeline/pipeline_labeller_3.py
@@ -1,17 +1,3 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 ds_labelled.select_columns(["input", "generations", "rating", "rationale"])[0]
 # {
 #     "input": "Describe the capital of Spain in 25 words.",

diff --git a/docs/snippets/technical-reference/pipeline/pipeline_labeller_generator_1.py b/docs/snippets/technical-reference/pipeline/pipeline_labeller_generator_1.py
@@ -1,17 +1,3 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 import os
 
 from distilabel.llm import InferenceEndpointsLLM, OpenAILLM

diff --git a/docs/snippets/technical-reference/pipeline/pipeline_labeller_generator_2.py b/docs/snippets/technical-reference/pipeline/pipeline_labeller_generator_2.py
@@ -1,17 +1,3 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 from datasets import Dataset
 
 xkcd_instructions = Dataset.from_dict(

diff --git a/docs/snippets/technical-reference/pipeline/pipeline_labeller_generator_3.py b/docs/snippets/technical-reference/pipeline/pipeline_labeller_generator_3.py
@@ -1,17 +1,3 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 print(ds_xkcd[1]["generations"][0])
 print("-----" * 5)
 print("RATING: ", ds_xkcd[1]["rating"][0])

diff --git a/docs/snippets/technical-reference/pipeline/pipeline_llmpool_processllm_1.py b/docs/snippets/technical-reference/pipeline/pipeline_llmpool_processllm_1.py
@@ -0,0 +1,21 @@
+from distilabel.llm import LLM, ProcessLLM
+from distilabel.tasks import Task, TextGenerationTask
+
+
+def load_notus(task: Task) -> LLM:  # (1)
+    import os
+    from distilabel.llm import vLLM
+    from vllm import LLM
+
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # (2)
+
+    return vLLM(
+        vllm=LLM(model="argilla/notus-7b-v1"),
+        task=task,
+        max_new_tokens=512,
+        temperature=0.7,
+        prompt_format="notus",
+    )
+
+
+llm = ProcessLLM(task=TextGenerationTask(), load_llm_fn=load_notus)