Skip to content

Commit

Permalink
Merge pull request #92 from lean-dojo/peiyang
Browse files Browse the repository at this point in the history
Fix minor bug: tokenizer typo in models.py
  • Loading branch information
Peiyang-Song authored Jul 1, 2024
2 parents 7cd54f6 + 646215e commit 96e3cbe
Showing 1 changed file with 8 additions and 8 deletions.
16 changes: 8 additions & 8 deletions python/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(
length_penalty: float = 0.0,
device: str = "cpu",
) -> None:
- self.tokenzier = AutoTokenizer.from_pretrained(name)
+ self.tokenizer = AutoTokenizer.from_pretrained(name)
if device == "auto":
device = get_cuda_if_available()
else:
Expand All @@ -60,7 +60,7 @@ def __init__(
self.length_penalty = length_penalty

def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
- tokenized_input = self.tokenzier(input + target_prefix, return_tensors="pt")
+ tokenized_input = self.tokenizer(input + target_prefix, return_tensors="pt")
output = self.model.generate(
tokenized_input.input_ids.to(self.device),
max_length=self.max_length,
Expand All @@ -72,7 +72,7 @@ def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float
return_dict_in_generate=True,
output_scores=True,
)
- raw_outputs = self.tokenzier.batch_decode(
+ raw_outputs = self.tokenizer.batch_decode(
output.sequences, skip_special_tokens=True
)
outputs = []
Expand Down Expand Up @@ -113,7 +113,7 @@ def __init__(
length_penalty: float = 0.0,
device: str = "cpu",
) -> None:
- self.tokenzier = AutoTokenizer.from_pretrained(name)
+ self.tokenizer = AutoTokenizer.from_pretrained(name)
if device == "auto":
device = get_cuda_if_available()
else:
Expand All @@ -128,7 +128,7 @@ def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float
assert (
target_prefix == ""
), "target_prefix is not supported by encoder-decoder Transformer"
- tokenized_input = self.tokenzier(input, return_tensors="pt")
+ tokenized_input = self.tokenizer(input, return_tensors="pt")
output = self.model.generate(
tokenized_input.input_ids.to(self.device),
max_length=self.max_length,
Expand All @@ -140,15 +140,15 @@ def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float
return_dict_in_generate=True,
output_scores=True,
)
- raw_outputs = self.tokenzier.batch_decode(
+ raw_outputs = self.tokenizer.batch_decode(
output.sequences, skip_special_tokens=True
)
return list(zip(raw_outputs, output.sequences_scores.exp().tolist()))


class EncoderOnlyTransformer(Encoder, Transformer):
def __init__(self, name: str, device: str = "cpu") -> None:
- self.tokenzier = AutoTokenizer.from_pretrained(name)
+ self.tokenizer = AutoTokenizer.from_pretrained(name)
if device == "auto":
device = get_cuda_if_available()
else:
Expand All @@ -158,7 +158,7 @@ def __init__(self, name: str, device: str = "cpu") -> None:

@torch.no_grad()
def encode(self, input: str) -> np.ndarray:
- tokenized_input = self.tokenzier(input, return_tensors="pt")
+ tokenized_input = self.tokenizer(input, return_tensors="pt")
hidden_state = self.model(
tokenized_input.input_ids.to(self.device)
).last_hidden_state
Expand Down

0 comments on commit 96e3cbe

Please sign in to comment.