From 4172ce820daa473fbe9f844b3210d2e6827640a4 Mon Sep 17 00:00:00 2001
From: Bryce Meyer
Date: Fri, 27 Sep 2024 01:50:44 +0200
Subject: [PATCH] Model llama 3.2 (#734)

* fixed typo

* added llama 3.2-1b

* configured 3b

* configured instruct models
---
 transformer_lens/HookedEncoder.py           |  2 +-
 transformer_lens/loading_from_pretrained.py | 80 +++++++++++++++++++++
 2 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/transformer_lens/HookedEncoder.py b/transformer_lens/HookedEncoder.py
index 59ede19af..a0118d0fc 100644
--- a/transformer_lens/HookedEncoder.py
+++ b/transformer_lens/HookedEncoder.py
@@ -255,7 +255,7 @@ def from_pretrained(
         if move_to_device:
             model.to(cfg.device)
 
-        print(f"Loaded pretrained model {model_name} into HookedTransformer")
+        print(f"Loaded pretrained model {model_name} into HookedEncoder")
 
         return model
 
diff --git a/transformer_lens/loading_from_pretrained.py b/transformer_lens/loading_from_pretrained.py
index e7ebea947..cc0295323 100644
--- a/transformer_lens/loading_from_pretrained.py
+++ b/transformer_lens/loading_from_pretrained.py
@@ -151,6 +151,10 @@
     "meta-llama/Meta-Llama-3-8B-Instruct",
     "meta-llama/Meta-Llama-3-70B",
     "meta-llama/Meta-Llama-3-70B-Instruct",
+    "meta-llama/Llama-3.2-1B",
+    "meta-llama/Llama-3.2-3B",
+    "meta-llama/Llama-3.2-1B-Instruct",
+    "meta-llama/Llama-3.2-3B-Instruct",
     "Baidicoot/Othello-GPT-Transformer-Lens",
     "bert-base-cased",
     "roneneldan/TinyStories-1M",
@@ -885,6 +889,82 @@ def convert_hf_model_config(model_name: str, **kwargs):
             "final_rms": True,
             "gated_mlp": True,
         }
+    elif "Llama-3.2-1B" in official_model_name:
+        cfg_dict = {
+            "d_model": 2048,
+            "d_head": 64,
+            "n_heads": 32,
+            "d_mlp": 8192,
+            "n_layers": 16,
+            "n_ctx": 2048,  # capped due to memory issues
+            "eps": 1e-5,
+            "d_vocab": 128256,
+            "act_fn": "silu",
+            "n_key_value_heads": 8,
+            "normalization_type": "RMS",
+            "positional_embedding_type": "rotary",
+            "rotary_adjacent_pairs": False,
+            "rotary_dim": 64,
+            "final_rms": True,
+            "gated_mlp": True,
+        }
+    elif "Llama-3.2-3B" in official_model_name:
+        cfg_dict = {
+            "d_model": 3072,
+            "d_head": 128,
+            "n_heads": 24,
+            "d_mlp": 8192,
+            "n_layers": 28,
+            "n_ctx": 2048,  # capped due to memory issues
+            "eps": 1e-5,
+            "d_vocab": 128256,
+            "act_fn": "silu",
+            "n_key_value_heads": 8,
+            "normalization_type": "RMS",
+            "positional_embedding_type": "rotary",
+            "rotary_adjacent_pairs": False,
+            "rotary_dim": 128,
+            "final_rms": True,
+            "gated_mlp": True,
+        }
+    elif "Llama-3.2-1B-Instruct" in official_model_name:
+        cfg_dict = {
+            "d_model": 2048,
+            "d_head": 64,
+            "n_heads": 32,
+            "d_mlp": 8192,
+            "n_layers": 16,
+            "n_ctx": 2048,  # capped due to memory issues
+            "eps": 1e-5,
+            "d_vocab": 128256,
+            "act_fn": "silu",
+            "n_key_value_heads": 8,
+            "normalization_type": "RMS",
+            "positional_embedding_type": "rotary",
+            "rotary_adjacent_pairs": False,
+            "rotary_dim": 64,
+            "final_rms": True,
+            "gated_mlp": True,
+        }
+    elif "Llama-3.2-3B-Instruct" in official_model_name:
+        cfg_dict = {
+            "d_model": 3072,
+            "d_head": 128,
+            "n_heads": 24,
+            "d_mlp": 8192,
+            "n_layers": 28,
+            "n_ctx": 2048,  # capped due to memory issues
+            "eps": 1e-5,
+            "d_vocab": 128256,
+            "act_fn": "silu",
+            "n_key_value_heads": 8,
+            "normalization_type": "RMS",
+            "positional_embedding_type": "rotary",
+            "rotary_adjacent_pairs": False,
+            "rotary_dim": 128,
+            "final_rms": True,
+            "gated_mlp": True,
+        }
     elif architecture == "GPTNeoForCausalLM":
         cfg_dict = {
             "d_model": hf_config.hidden_size,
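
The patch above only registers the new checkpoint names and their architecture hyperparameters; loading still goes through the existing TransformerLens entry points. A minimal usage sketch, assuming the standard HookedTransformer.from_pretrained API and that your Hugging Face account has access to the gated meta-llama weights:

    from transformer_lens import HookedTransformer

    # Any of the four names added to OFFICIAL_MODEL_NAMES in this patch should resolve here.
    model = HookedTransformer.from_pretrained("meta-llama/Llama-3.2-1B")

    # Run a prompt and cache intermediate activations for inspection.
    logits, cache = model.run_with_cache("The capital of France is")
    print(logits.shape, cache["blocks.0.attn.hook_z"].shape)

Note that n_ctx is capped at 2048 in these configs (rather than the model's full context window) to limit memory use, so prompts longer than 2048 tokens will not fit without overriding that value.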