diff --git a/generate.py b/generate.py
index a1ede96e3..fe7182a6f 100644
--- a/generate.py
+++ b/generate.py
@@ -435,7 +435,9 @@ def callback(x):
         import contextlib

         generator_args.encoded_prompt = encoded
-        if (i != generator_args.num_samples - 1 or not profile) or (use_tp and rank != 0):
+        if (i != generator_args.num_samples - 1 or not profile) or (
+            use_tp and rank != 0
+        ):
             prof = contextlib.nullcontext()
         else:
             torch.profiler._utils._init_for_cuda_graphs()
@@ -495,7 +497,7 @@ def main(args):
     speculative_builder_args = BuilderArgs.from_speculative_args(args)
     tokenizer_args = TokenizerArgs.from_args(args)
     generator_args = GeneratorArgs.from_args(args)
-
+
     _main(
         builder_args,
         speculative_builder_args,
diff --git a/quantize.py b/quantize.py
index c046ebb97..0753583e5 100644
--- a/quantize.py
+++ b/quantize.py
@@ -515,11 +515,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:


 def replace_embedding_weight_only_grouped_int8_per_channel(
-    module,
-    device,
-    bitwidth: int = 8,
-    groupsize: Optional[int] = None,
-    packed=False
+    module, device, bitwidth: int = 8, groupsize: Optional[int] = None, packed=False
 ):
     for name, child in module.named_children():
         # print(f"name: {name}")