feat: allow specifying dataset size (#23)
* feat: introduce n samples parameter

* chore: change learning rate

* feat: allow specifying the dataset name

* feat: add pushing the model to hf

* feat: add pushing the model to hf

* fix: CI

* fix: CI

* chore: fix mypy

* chore: ci

* test: fix test
alaeddine-13 authored Sep 4, 2023
1 parent 2418384 commit ab56079
Showing 2 changed files with 29 additions and 9 deletions.
textbook/dataset.py — 12 changes: 5 additions & 7 deletions

@@ -1,4 +1,4 @@
-from typing import Protocol
+from typing import Protocol, Optional
 import random
 
 from datasets import Dataset, load_dataset
@@ -19,6 +19,7 @@ def __init__(
         self,
         tokenizer: PreTrainedTokenizer,
         debug: bool = False,
+        dataset_name: Optional[str] = None,
     ):
         ...
@@ -30,11 +31,7 @@ def gen(n: int = 100_000, upper_bound: int = 512):
         random_integer = random.randint(1, upper_bound)
         yield {"text": "hello world" * random_integer}
 
-    def __init__(
-        self,
-        tokenizer: PreTrainedTokenizer,
-        debug: bool = False,
-    ):
+    def __init__(self, tokenizer: PreTrainedTokenizer, debug: bool = False, **kwargs):
         self.debug = debug
 
         dataset = Dataset.from_generator(self.gen)
@@ -77,11 +74,12 @@ class ExerciseDatast:
     def __init__(
         self,
        tokenizer: PreTrainedTokenizer,
+        dataset_name: str = "jinaai/code_exercises_40k",
         debug: bool = False,
     ):
         self.debug = debug
 
-        dataset = load_dataset("jinaai/code_exercises_40k")["train"]
+        dataset = load_dataset(dataset_name)["train"]
 
         if debug:
             dataset = dataset.select(range(10))
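With dataset_name threaded through both constructors, any dataset class implementing the protocol can now be pointed at a different Hugging Face dataset instead of the hard-coded "jinaai/code_exercises_40k". A minimal usage sketch, assuming the tokenizer is loaded with transformers (the checkpoint id below is illustrative, and "your-org/your_exercises" is a hypothetical dataset sharing the same schema):

    from transformers import AutoTokenizer

    from textbook.dataset import ExerciseDatast

    # Illustrative checkpoint; any tokenizer matching the model being trained works.
    tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase-1b")

    # Default behaviour: loads jinaai/code_exercises_40k.
    exercises = ExerciseDatast(tokenizer=tokenizer)

    # New behaviour: the same class reads any dataset with the same schema.
    custom = ExerciseDatast(
        tokenizer=tokenizer,
        dataset_name="your-org/your_exercises",  # hypothetical dataset id
    )

Note that the dummy dataset's collapsed __init__ accepts **kwargs so that callers can pass dataset_name uniformly even to classes that ignore it.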
textbook/train.py — 26 changes: 24 additions & 2 deletions

@@ -41,19 +41,22 @@ def train(
     epochs: int = 1,
     micro_batch_size: int = 1,
     batch_size: int = 1,
-    learning_rate: float = 3e-4,
+    learning_rate: float = 3e-5,
     output_dir: Optional[str] = None,
     wandb_run_name: str = "",
     use_wandb: bool = False,
     wandb_project: str = "textbook",
     wandb_log_model: Optional[
         bool
     ] = None,  # will be true by default if use_wandb is true
+    push_model_to_hf: bool = False,  # if set, will push the model to hf
     local_rank: Annotated[int, typer.Option("--local_rank")] = 0,
     deepspeed: Optional[str] = None,
     debug: bool = False,
     eval_size: Optional[int] = None,
     eval_max_new_tokens: int = 512,
+    n_samples: Optional[int] = None,
+    dataset_name: Optional[str] = "jinaai/code_exercises_40k",
 ):
     module_cls: Type[BaseModule] = getattr(import_module("textbook.model"), module)
     module_instance = module_cls(debug=debug)
@@ -64,7 +67,17 @@ def train(
     dataset_cls: Type[CustomDataset] = getattr(
         import_module("textbook.dataset"), dataset
     )
-    dataset_instance = dataset_cls(tokenizer=tokenizer, debug=debug)
+    if dataset_name:
+        dataset_instance = dataset_cls(
+            tokenizer=tokenizer, debug=debug, dataset_name=dataset_name
+        )
+    else:
+        dataset_instance = dataset_cls(tokenizer=tokenizer, debug=debug)
+
+    if n_samples:
+        dataset_instance.train_dataset = dataset_instance.train_dataset.select(
+            range(n_samples)
+        )
 
     if debug:
         wandb_run_name = "debug"
@@ -115,6 +128,15 @@ def train(
 
     trainer.train()
 
+    if push_model_to_hf:
+        # Save the pretrained model locally
+        model.save_pretrained(output_dir)  # type: ignore
+        tokenizer.save_pretrained(output_dir)  # type: ignore
+
+        # Push to the hub
+        model.push_to_hub("jinaai/starcoder-1b-textbook")  # type: ignore
+        tokenizer.push_to_hub("jinaai/starcoder-1b-textbook")  # type: ignore
+
     accuracy_results, sample_results = evaluate(
         model, tokenizer, eval_size=eval_size, max_new_tokens=eval_max_new_tokens
     )
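The net effect in train.py: dataset_name is forwarded to the dataset constructor when set, n_samples truncates the training split via datasets' Dataset.select, and push_model_to_hf saves the final weights to output_dir and pushes them to the Hub after training (which requires an authenticated session, e.g. via huggingface-cli login). A sketch of a direct call, assuming the parameters outside this diff (module, dataset, and the rest of the signature) have workable defaults:

    from textbook.train import train

    # Direct call for illustration; in the repo this function is a Typer command.
    train(
        dataset_name="your-org/your_exercises",  # hypothetical dataset id
        n_samples=1_000,             # keep only the first 1,000 training samples
        learning_rate=3e-5,          # new default introduced by this commit
        output_dir="./checkpoints",  # needed for the local save before pushing
        push_model_to_hf=True,       # save locally, then push to the Hub
    )

Note that push_to_hub targets the fixed repo id "jinaai/starcoder-1b-textbook", so pushing only succeeds with write access to that repository.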
