Skip to content

Commit

Permalink
Update docs to include info about ProcessLLM and LLMPool (#176)
Browse files Browse the repository at this point in the history
* Update `LLM` concept guide

* Update `tasks` technical-reference

* Mention `Future` as output if thread pool

* Exclude docs python snippets

* Add section about using `LLMPool` with `Pipeline`

* Remove header from docs snippets

* Finish `LLMPool` guide

* Update `mkdocs-material` dep

* Remove line

* Add preference-dataset example

* Add section about `ProcessLLM` and `LLMPool`

* Remove `.git`
  • Loading branch information
gabrielmbmb authored Dec 21, 2023
1 parent 00de1f1 commit 7646510
Show file tree
Hide file tree
Showing 30 changed files with 545 additions and 218 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ repos:
- id: insert-license
name: "Insert license header in Python source files"
files: \.py$
exclude: ^docs/snippets/
args:
- --license-filepath
- license_header.txt
Expand Down
14 changes: 14 additions & 0 deletions docs/snippets/guides/self-instruct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from distilabel.llm import OpenAILLM
from distilabel.pipeline import Pipeline
from distilabel.tasks import SelfInstructTask

# Self-instruct task configured with an application description and
# `num_instructions=5`.
self_instruct = SelfInstructTask(
    application_description="An AI application to generate tables in markdown format.",
    num_instructions=5,
)

# OpenAI-backed LLM that runs the self-instruct task.
generator = OpenAILLM(task=self_instruct)

# Pipeline that only has a generator LLM.
pipeline = Pipeline(generator=generator)

# NOTE(review): `dataset` is not defined in this snippet; presumably it is created
# earlier in the guide -- confirm before running this file standalone.
dataset = pipeline.generate(dataset=dataset, num_generations=4, batch_size=2)
4 changes: 2 additions & 2 deletions docs/snippets/technical-reference/llm/llamacpp_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
prompt_format="notus",
)

result_llamacpp = llm.generate([{"input": "What is the capital of Spain?"}])
# >>> print(result_llamacpp[0][0]["parsed_output"]["generations"])
result = llm.generate([{"input": "What is the capital of Spain?"}])
# >>> print(result[0][0]["parsed_output"]["generations"])
# The capital of Spain is Madrid. It is located in the center of the country and
# is known for its vibrant culture, beautiful architecture, and delicious food.
# Madrid is home to many famous landmarks such as the Prado Museum, Retiro Park,
Expand Down
39 changes: 39 additions & 0 deletions docs/snippets/technical-reference/llm/llmpool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from distilabel.tasks import TextGenerationTask, Task
from distilabel.llm import ProcessLLM, LLM, LLMPool

def load_gpt_3(task: Task) -> LLM:
    """Create the `gpt-3.5-turbo` OpenAI LLM for the given task.

    Used below as the `load_llm_fn` of a `ProcessLLM`. `OpenAILLM` is imported
    inside the function body rather than at module level.

    Args:
        task: the task handed to the `OpenAILLM`.

    Returns:
        An `OpenAILLM` using the `gpt-3.5-turbo` model and `num_threads=4`.
    """
    from distilabel.llm import OpenAILLM

    gpt_3 = OpenAILLM(model="gpt-3.5-turbo", task=task, num_threads=4)
    return gpt_3

def load_gpt_4(task: Task) -> LLM:
    """Create the `gpt-4` OpenAI LLM for the given task.

    Used below as the `load_llm_fn` of a `ProcessLLM`. `OpenAILLM` is imported
    inside the function body rather than at module level.

    Args:
        task: the task handed to the `OpenAILLM`.

    Returns:
        An `OpenAILLM` using the `gpt-4` model and `num_threads=4`.
    """
    from distilabel.llm import OpenAILLM

    gpt_4 = OpenAILLM(model="gpt-4", task=task, num_threads=4)
    return gpt_4


# Pool two ProcessLLMs, one per loader function, each with its own
# TextGenerationTask.
pool = LLMPool(llms=[
    ProcessLLM(task=TextGenerationTask(), load_llm_fn=load_gpt_3),
    ProcessLLM(task=TextGenerationTask(), load_llm_fn=load_gpt_4),
])
# Ask the pool for two generations of the single input row.
result = pool.generate(
    inputs=[{"input": "Write a letter for Bob"}], num_generations=2
)
# Tear the pool down after generating.
pool.teardown()
# >>> print(result[0][0]["parsed_output"]["generations"])
# Dear Bob,
# I hope this letter finds you in good health and high spirits. I know it's been a while since we last caught up, and I wanted to take the time to connect and share a few updates.
# Life has been keeping me pretty busy lately. [Provide a brief overview of what you've been up to: work, school, family, hobbies, etc.]
# I've often found myself reminiscing about the good old days, like when we [include a memorable moment or shared experience with Bob].
# >>> print(result[0][1]["parsed_output"]["generations"])
# Of course, I'd be happy to draft a sample letter for you. However, I would need some additional
# information including who "Bob" is, the subject matter of the letter, the tone (formal or informal),
# and any specific details or points you'd like to include. Please provide some more context and I'll do my best to assist you.
11 changes: 5 additions & 6 deletions docs/snippets/technical-reference/llm/openai_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,11 @@
model="gpt-3.5-turbo",
task=OpenAITextGenerationTask(),
max_new_tokens=256,
num_threads=2,
openai_api_key=os.environ.get("OPENAI_API_KEY"),
temperature=0.3,
)
result_openai = openaillm.generate([{"input": "What is OpenAI?"}])
# >>> result_openai
# [<Future at 0x2970ea560 state=running>]
# >>> result_openai[0].result()[0][0]["parsed_output"]["generations"]
# 'OpenAI is an artificial intelligence research organization that aims to ensure that artificial general intelligence (AGI) benefits all of humanity. AGI refers to highly autonomous systems that outperform humans at most economically valuable work. OpenAI conducts research, develops AI technologies, and promotes the responsible and safe use of AI. They also work on projects to make AI more accessible and beneficial to society. OpenAI is committed to transparency, cooperation, and avoiding uses of AI that could harm humanity or concentrate power in the wrong hands.'
result = openaillm.generate([{"input": "What is OpenAI?"}])
# >>> print(result[0][0]["parsed_output"]["generations"])
# OpenAI is an artificial intelligence research laboratory and company. It was founded
# with the goal of ensuring that artificial general intelligence (AGI) benefits all of
# humanity. OpenAI conducts cutting-edge research in various fields of AI ...
25 changes: 25 additions & 0 deletions docs/snippets/technical-reference/llm/processllm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from distilabel.tasks import TextGenerationTask, Task
from distilabel.llm import ProcessLLM, LLM


def load_gpt_4(task: Task) -> LLM:
    """Create the `gpt-4` OpenAI LLM that the `ProcessLLM` below wraps.

    `OpenAILLM` is imported inside the function body rather than at module
    level.

    Args:
        task: the task handed to the `OpenAILLM`.

    Returns:
        An `OpenAILLM` using the `gpt-4` model and `num_threads=4`.
    """
    from distilabel.llm import OpenAILLM

    openai_llm = OpenAILLM(model="gpt-4", task=task, num_threads=4)
    return openai_llm


# ProcessLLM that builds its underlying LLM through `load_gpt_4`.
llm = ProcessLLM(task=TextGenerationTask(), load_llm_fn=load_gpt_4)
# `generate` hands back a future; the generations are read from it below.
future = llm.generate(
    inputs=[{"input": "Write a letter for Bob"}], num_generations=1
) # (1)
# NOTE(review): teardown() runs before future.result(); confirm the pending
# generation still completes once teardown has been requested.
llm.teardown() # (2)
result = future.result()
# >>> print(result[0][0]["parsed_output"]["generations"])
# Dear Bob,
# I hope this letter finds you in good health and high spirits. I know it's been a while since we last caught up, and I wanted to take the time to connect and share a few updates.
# Life has been keeping me pretty busy lately. [Provide a brief overview of what you've been up to: work, school, family, hobbies, etc.]
# I've often found myself reminiscing about the good old days, like when we [include a memorable moment or shared experience with Bob].
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

# Load the models from huggingface hub:
tokenizer = AutoTokenizer.from_pretrained("argilla/notus-7b-v1")
model = AutoModelForCausalLM.from_pretrained("argilla/notus-7b-v1")
model = AutoModelForCausalLM.from_pretrained("argilla/notus-7b-v1", device_map="auto")

# Instantiate our LLM with them:
llm = TransformersLLM(
Expand All @@ -15,3 +15,9 @@
temperature=0.3,
prompt_format="notus",
)

result = llm.generate([{"input": "What's a large language model?"}])
# >>> print(result[0][0]["parsed_output"]["generations"])
# A large language model is a type of machine learning algorithm that is designed to analyze
# and understand large amounts of text data. It is called "large" because it requires a
# vast amount of data to train and improve its accuracy. These models are ...
17 changes: 17 additions & 0 deletions docs/snippets/technical-reference/llm/vllm_generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from distilabel.tasks import TextGenerationTask
from distilabel.llm import vLLM
from vllm import LLM

# Build the distilabel `vLLM` wrapper around a `vllm.LLM` loaded with the
# Notus model.
llm = vLLM(
    vllm=LLM(model="argilla/notus-7b-v1"),
    task=TextGenerationTask(),
    max_new_tokens=512,
    temperature=0.3,
    prompt_format="notus",
)
# Stored as `result` so that the example `print` below refers to a defined name
# (and matches the other LLM snippets).
result = llm.generate([{"input": "What's a large language model?"}])
# >>> print(result[0][0]["parsed_output"]["generations"])
# A large language model is a type of artificial intelligence (AI) system that is designed
# to understand and interpret human language. It is called "large" because it uses a vast
# amount of data, typically billions of words or more, to learn and make predictions about
# language. Large language models are ...
14 changes: 0 additions & 14 deletions docs/snippets/technical-reference/pipeline/argilla.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,3 @@
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argilla as rg

rg.init(api_key="<YOUR_ARGILLA_API_KEY>", api_url="<YOUR_ARGILLA_API_URL>")
Expand Down
14 changes: 0 additions & 14 deletions docs/snippets/technical-reference/pipeline/pipe_1.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,3 @@
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from distilabel.llm import InferenceEndpointsLLM
Expand Down
14 changes: 0 additions & 14 deletions docs/snippets/technical-reference/pipeline/pipe_2.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,3 @@
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from datasets import load_dataset

instruction_dataset = (
Expand Down
14 changes: 0 additions & 14 deletions docs/snippets/technical-reference/pipeline/pipe_3.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,3 @@
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

print(pipe_dataset["input"][-1])
# Create a 3 turn conversation between a customer and a grocery store clerk - that is, 3 per person. Then tell me what they talked about.

Expand Down
14 changes: 0 additions & 14 deletions docs/snippets/technical-reference/pipeline/pipeline_generator_2.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,3 @@
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from datasets import Dataset

dataset = Dataset.from_dict(
Expand Down
14 changes: 0 additions & 14 deletions docs/snippets/technical-reference/pipeline/pipeline_generator_3.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,3 @@
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

print(dataset_generated)
# Dataset({
# features: ['input', 'generation_model', 'generation_prompt', 'raw_generation_responses', 'generations'],
Expand Down
14 changes: 0 additions & 14 deletions docs/snippets/technical-reference/pipeline/pipeline_labeller_2.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,3 @@
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from datasets import Dataset

dataset_test = Dataset.from_dict(
Expand Down
14 changes: 0 additions & 14 deletions docs/snippets/technical-reference/pipeline/pipeline_labeller_3.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,3 @@
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

ds_labelled.select_columns(["input", "generations", "rating", "rationale"])[0]
# {
# "input": "Describe the capital of Spain in 25 words.",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,3 @@
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from distilabel.llm import InferenceEndpointsLLM, OpenAILLM
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,3 @@
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from datasets import Dataset

xkcd_instructions = Dataset.from_dict(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,3 @@
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

print(ds_xkcd[1]["generations"][0])
print("-----" * 5)
print("RATING: ", ds_xkcd[1]["rating"][0])
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from distilabel.llm import LLM, ProcessLLM
from distilabel.tasks import Task, TextGenerationTask


def load_notus(task: Task) -> LLM:  # (1)
    """Load the Notus model with vLLM and wrap it for the given task.

    Args:
        task: the task handed to the distilabel `vLLM` wrapper.

    Returns:
        A distilabel `vLLM` instance around `argilla/notus-7b-v1`.
    """
    import os

    from distilabel.llm import vLLM

    # This local import shadows the module-level distilabel `LLM` inside the
    # function body only.
    from vllm import LLM

    # Set before the engine is created.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # (2)

    # Build the vLLM engine first, then hand it to the distilabel wrapper.
    notus_engine = LLM(model="argilla/notus-7b-v1")
    return vLLM(
        vllm=notus_engine,
        task=task,
        max_new_tokens=512,
        temperature=0.7,
        prompt_format="notus",
    )


# ProcessLLM that builds its underlying LLM through `load_notus`.
llm = ProcessLLM(task=TextGenerationTask(), load_llm_fn=load_notus)
Loading

0 comments on commit 7646510

Please sign in to comment.