diff --git a/generate.py b/generate.py
index 4d52b4c8b..4c1fb679a 100644
--- a/generate.py
+++ b/generate.py
@@ -31,22 +31,22 @@
 @dataclass
 class GeneratorArgs:
     prompt: str = "torchchat is pronounced torch-chat and is so cool because"
-    chat: bool = (False,)
-    gui: bool = (False,)
-    num_samples: int = (1,)
-    max_new_tokens: int = (200,)
-    top_k: int = (200,)
-    temperature: int = (0,)  # deterministic argmax
-    compile: bool = (False,)
-    compile_prefill: bool = (False,)
-    speculate_k: int = (5,)
+    chat_mode: bool = False
+    gui_mode: bool = False
+    num_samples: int = 1
+    max_new_tokens: int = 200
+    top_k: int = 200
+    temperature: int = 0  # deterministic argmax
+    compile: bool = False
+    compile_prefill: bool = False
+    speculate_k: int = 5

     @classmethod
     def from_args(cls, args):  # -> GeneratorArgs:
         return cls(
             prompt=args.prompt,
-            chat=args.chat,
-            gui=args.gui,
+            chat_mode=args.chat,
+            gui_mode=args.gui,
             num_samples=args.num_samples,
             max_new_tokens=args.max_new_tokens,
             top_k=args.top_k,
@@ -316,9 +316,7 @@ def _main(
     builder_args: BuilderArgs,
     speculative_builder_args: BuilderArgs,
     tokenizer_args: TokenizerArgs,
-    prompt: str = "Hello, my name is",
-    chat_mode: bool = False,
-    num_samples: int = 5,
+    generator_args: GeneratorArgs,
     max_new_tokens: int = 100,
     top_k: int = 200,
     temperature: float = 0.8,
@@ -365,7 +363,9 @@ def _main(
     else:
         draft_model = None

-    encoded = encode_tokens(tokenizer, prompt, bos=True, device=builder_args.device)
+    encoded = encode_tokens(
+        tokenizer, generator_args.prompt, bos=True, device=builder_args.device
+    )
     print(encoded)
     prompt_length = encoded.size(0)

@@ -404,9 +404,9 @@
     }
     start = -1 if compile else 0

-    for i in range(start, num_samples):
+    for i in range(start, generator_args.num_samples):
         device_sync(device=builder_args.device)
-        if i >= 0 and chat_mode:
+        if i >= 0 and generator_args.chat_mode:
             prompt = input("What is your prompt? ")
             if is_chat:
                 prompt = f"{B_INST} {prompt.strip()} {E_INST}"
@@ -414,7 +414,7 @@
                 tokenizer, prompt, bos=True, device=builder_args.device
             )

-        if chat_mode and i >= 0:
+        if generator_args.chat_mode and i >= 0:
             buffer = []
             period_id = tokenizer.encode(".")[0]
             done_generating = False
@@ -436,7 +436,7 @@ def callback(x):
         t0 = time.perf_counter()
         import contextlib

-        if (i != num_samples - 1 or not profile) or (use_tp and rank != 0):
+        if (i != generator_args.num_samples - 1 or not profile) or (use_tp and rank != 0):
             prof = contextlib.nullcontext()
         else:
             torch.profiler._utils._init_for_cuda_graphs()
@@ -448,7 +448,7 @@ def callback(x):
                 max_new_tokens,
                 draft_model=draft_model,
                 speculate_k=speculate_k,
-                chat_mode=chat_mode,
+                chat_mode=generator_args.chat_mode,
                 callback=callback,
                 temperature=temperature,
                 top_k=top_k,
@@ -465,7 +465,7 @@ def callback(x):
         device_sync(device=builder_args.device)
         t = time.perf_counter() - t0

-        if not chat_mode:
+        if not generator_args.chat_mode:
             print(tokenizer.decode(y.tolist()))
         else:
             print()
@@ -495,13 +495,13 @@ def main(args):
     builder_args = BuilderArgs.from_args(args)
     speculative_builder_args = BuilderArgs.from_speculative_args(args)
     tokenizer_args = TokenizerArgs.from_args(args)
+    generator_args = GeneratorArgs.from_args(args)
+
     _main(
         builder_args,
         speculative_builder_args,
         tokenizer_args,
-        args.prompt,
-        args.chat,
-        args.num_samples,
+        generator_args,
         args.max_new_tokens,
         args.top_k,
         args.temperature,
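
Note on the dataclass hunk: the old field defaults such as `chat: bool = (False,)` are accidental one-element tuples (trailing comma inside parentheses), and any non-empty tuple is truthy, so a flag annotated as defaulting to False actually evaluated as enabled. A minimal standalone demo of that failure mode, with hypothetical Before/After class names:

from dataclasses import dataclass

@dataclass
class Before:
    chat: bool = (False,)  # annotated bool, but actually the tuple (False,)

@dataclass
class After:
    chat_mode: bool = False  # a real boolean, as in the + lines above

assert Before().chat == (False,)
assert bool(Before().chat) is True      # the "disabled" flag is truthy
assert bool(After().chat_mode) is False  # fixed by dropping the trailing comma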
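The rest of the diff threads a single GeneratorArgs object through _main instead of loose prompt/chat/num_samples parameters. A self-contained sketch of that pattern, abridged to three fields; the argparse flag spellings are assumptions for illustration, and only the args.prompt / args.chat / args.num_samples attribute names come from the diff:

import argparse
from dataclasses import dataclass

@dataclass
class GeneratorArgs:
    prompt: str = "torchchat is pronounced torch-chat and is so cool because"
    chat_mode: bool = False
    num_samples: int = 1

    @classmethod
    def from_args(cls, args):
        # Map CLI namespace attributes onto dataclass fields; note the rename
        # from args.chat to chat_mode, mirroring the diff.
        return cls(
            prompt=args.prompt,
            chat_mode=args.chat,
            num_samples=args.num_samples,
        )

parser = argparse.ArgumentParser()
parser.add_argument("--prompt", default="Hello, my name is")
parser.add_argument("--chat", action="store_true")
parser.add_argument("--num-samples", type=int, default=1)
args = parser.parse_args(["--prompt", "hi", "--num-samples", "2"])

generator_args = GeneratorArgs.from_args(args)
assert generator_args.num_samples == 2  # one object now carries the generation knobs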