Support external models with vllm huggingface openai gemini and claude (#97)

* support external models: vllm huggingface openai

* fix prompt for internlm

* support gemini claude

* fix_bugs

* fix_model_bugs

* update_requirement

* add requirements in README

* fixed by suggestions

* removed comments
hsz0403 authored and Peiyang-Song committed Sep 1, 2024
1 parent 5e35567 commit b7e199e
Showing 10 changed files with 578 additions and 2 deletions.
23 changes: 23 additions & 0 deletions LeanCopilotTests/ModelAPIs.lean
@@ -99,4 +99,27 @@ def reproverExternalEncoder : ExternalEncoder := {
-- Go to ./python and run `uvicorn server:app --port 23337`
#eval encode reproverExternalEncoder "n : ℕ\n⊢ gcd n n = n"
/--
LLM APIs: OpenAI, Claude, etc.
-/
def gpt4 : ExternalGenerator := {
name := "gpt4"
host := "localhost"
port := 23337
}
#eval generate gpt4 "n : ℕ\n⊢ gcd n n = n"
/--
Math LLMs: InternLM, DeepSeekMath, etc.
-/
def internLM : ExternalGenerator := {
name := "InternLM"
host := "localhost"
port := 23337
}
#eval generate internLM "n : ℕ\n⊢ gcd n n = n"
-/
2 changes: 1 addition & 1 deletion python/README.md
@@ -7,7 +7,7 @@ Python Server for External Models
conda create --name lean-copilot python=3.10 python numpy
conda activate lean-copilot
pip install torch --index-url https://download.pytorch.org/whl/cu121 # Depending on whether you have CUDA and the CUDA version; see https://pytorch.org/.
pip install fastapi uvicorn loguru transformers openai
pip install fastapi uvicorn loguru transformers openai anthropic google-generativeai vllm
```
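
The runner code in this commit reads provider API keys from environment variables: `ANTHROPIC_KEY` for Claude and `GOOGLE_API_KEY` for Gemini (see the runner files below); the OpenAI runner is not shown here, so the SDK's standard `OPENAI_API_KEY` is assumed. A minimal sketch, under those assumptions, to confirm the keys are set before starting the server:

```python
import os

# ANTHROPIC_KEY and GOOGLE_API_KEY are the names used by the runner code in this
# commit; OPENAI_API_KEY is assumed for the OpenAI runner.
for var in ("OPENAI_API_KEY", "ANTHROPIC_KEY", "GOOGLE_API_KEY"):
    print(f"{var}: {'set' if os.getenv(var) else 'missing'}")
```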


5 changes: 5 additions & 0 deletions python/external_models/__init__.py
@@ -0,0 +1,5 @@
from .oai_runner import OpenAIRunner
from .hf_runner import HFTacticGenerator
from .vllm_runner import VLLMTacticGenerator
from .claude_runner import ClaudeRunner
from .gemini_runner import GeminiRunner
59 changes: 59 additions & 0 deletions python/external_models/claude_runner.py
@@ -0,0 +1,59 @@
import torch
import numpy as np
from loguru import logger
from typing import List, Tuple
from abc import ABC, abstractmethod
from transformers import (
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
AutoModelForTextEncoding,
)
import os
try:
from anthropic import Anthropic
except ImportError:
pass
from .external_parser import *


class ClaudeRunner(Generator, Transformer):
client = Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))

def __init__(self, **args):
        self.client_kwargs: dict[str, str | float | int] = {
"model": args['model'],
"temperature": args['temperature'],
"max_tokens": args['max_tokens'],
"top_p": args['top_p'],
}
self.name = self.client_kwargs["model"]

def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
prompt = pre_process_input(self.name, input + target_prefix)

        try:
            # Claude 3 models are served through the Messages API rather than the
            # legacy text-completions endpoint, so the prompt is sent as a single
            # user message.
            response = self.client.messages.create(
                messages=[{"role": "user", "content": prompt}],
                **self.client_kwargs,
            )
            content = response.content[0].text

        except Exception as e:
            raise e

        results = [(post_process_output(self.name, content), 1.0)]  # the API returns one completion per request
        return choices_dedup(results)


if __name__ == "__main__":

    generation_kwargs = {"model": "claude-3-opus-20240229",  # the Anthropic API expects the full dated model ID
"temperature": 0.9,
"max_tokens": 1024,
"top_p": 0.9,
}

model = ClaudeRunner(**generation_kwargs)
print(model.generate("n : ℕ\n⊢ gcd n n = n"))
81 changes: 81 additions & 0 deletions python/external_models/external_parser.py
@@ -0,0 +1,81 @@
import os
import torch
import argparse
import numpy as np
from loguru import logger
from typing import List, Tuple
from abc import ABC, abstractmethod
from transformers import (
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
AutoModelForTextEncoding,
)


def get_cuda_if_available():
return torch.device("cuda" if torch.cuda.is_available() else "cpu")


def pre_process_input(model_name, input):
if model_name == "internlm/internlm2-math-plus-1_8b":
prompt="My LEAN 4 state is:\n```lean\n" + input + \
"```\nPlease predict a possible tactic to help me prove the theorem."
prompt = f"""<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"""
    elif model_name == "gpt-3.5-turbo" or model_name == "gpt-4-turbo-preview":
        prompt = 'Here is a theorem you need to prove in Lean:\n' + \
            input + '\nNow you should suggest a one-line tactic in Lean code:'
    elif 'gemini' in model_name or "claude" in model_name:
        prompt = 'Here is a theorem you need to prove in Lean:\n' + \
            input + '\nNow you should suggest a one-line tactic in Lean code:'
else:
raise NotImplementedError(f"External model '{model_name}' not supported")
return prompt
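

# Example: pre_process_input("internlm/internlm2-math-plus-1_8b", "n : ℕ\n⊢ gcd n n = n")
# wraps the goal in InternLM's <|im_start|>user ... <|im_end|> chat template, while the
# OpenAI/Gemini/Claude branches produce a plain-text instruction prompt.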


def post_process_output(model_name, output):
if model_name == "internlm/internlm2-math-plus-1_8b":
result = output.split(
'assistant')[-1].split('lean')[-1].split('```')[0].split('\n')[1]
elif model_name == "gpt-3.5-turbo" or model_name == "gpt-4-turbo-preview":
result = output.split('lean')[-1].split('```')[0].split('\n')[1]
elif 'gemini' in model_name or "claude" in model_name:
result = output.split('lean')[-1].split('```')[0].split('\n')[1]
else:
raise NotImplementedError(f"External model '{model_name}' not supported")
return result


def choices_dedup(output_list: List[tuple[str, float]]) -> List[tuple[str, float]]:
unique_data = {}
for item in output_list:
if item[0] not in unique_data or item[1] > unique_data[item[0]]:
unique_data[item[0]] = item[1]
sorted_data = sorted(unique_data.items(), key=lambda x: x[1], reverse=True)
return sorted_data
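

# Example (hypothetical scores):
#   choices_dedup([("simp", 0.2), ("simp", 0.9), ("rfl", 0.5)])
#   == [("simp", 0.9), ("rfl", 0.5)]
# i.e. the highest score is kept per distinct tactic, sorted in descending order.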


class Generator(ABC):
@abstractmethod
def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
pass


class Encoder(ABC):
@abstractmethod
def encode(self, input: str) -> np.ndarray:
pass


class Transformer:
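    # Small mixin giving locally hosted (Hugging Face) runners .cuda()/.cpu() helpers
    # and a .device property via self.model. The API-backed runners (e.g. ClaudeRunner,
    # GeminiRunner) also inherit it but never set self.model, so these helpers go unused there.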
def cuda(self) -> None:
self.model.cuda()

def cpu(self) -> None:
self.model.cpu()

@property
def device(self) -> torch.device:
return self.model.device


85 changes: 85 additions & 0 deletions python/external_models/gemini_runner.py
@@ -0,0 +1,85 @@
import torch
import numpy as np
from loguru import logger
from typing import List, Tuple
from abc import ABC, abstractmethod
from transformers import (
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
AutoModelForTextEncoding,
)
import os
from .external_parser import *

try:
import google.generativeai as genai
from google.generativeai import GenerationConfig
except ImportError:
pass


class GeminiRunner(Generator, Transformer):
    genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))  # configure() returns None; the per-model client is created in __init__
safety_settings = [
{
"category": "HARM_CATEGORY_HARASSMENT",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_HATE_SPEECH",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
"threshold": "BLOCK_NONE",
},]
def __init__(self, **args):

        self.client_kwargs: dict[str, str | float | int] = {
            "model": args['model'],
            "temperature": args['temperature'],
            "max_tokens": args['max_tokens'],
            "top_p": args['top_p'],
        }
self.name = self.client_kwargs["model"]

self.client = genai.GenerativeModel(args['model'])
self.generation_config = GenerationConfig(
candidate_count=1,
max_output_tokens=args['max_tokens'],
temperature=args['temperature'],
top_p=args['top_p'],
)
def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
prompt = pre_process_input(self.name, input + target_prefix)


response = self.client.generate_content(
prompt,
generation_config=self.generation_config,
safety_settings=GeminiRunner.safety_settings,
)



        results = [(post_process_output(self.name, response.text), 1.0)]  # one candidate per request (candidate_count=1 above)
return choices_dedup(results)


if __name__ == "__main__":

generation_kwargs = {"model": 'gemini-1.0-pro',
"temperature": 0.9,
"max_tokens": 1024,
"top_p": 0.9,
}

model = GeminiRunner(**generation_kwargs)
print(model.generate("n : ℕ\n⊢ gcd n n = n"))
96 changes: 96 additions & 0 deletions python/external_models/hf_runner.py
@@ -0,0 +1,96 @@
import torch
import numpy as np
from loguru import logger
from typing import List, Tuple
from abc import ABC, abstractmethod
from transformers import (
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
AutoModelForTextEncoding,
)
import os
from .external_parser import *


class HFTacticGenerator(Generator, Transformer):
def __init__(
self,
**args
) -> None:
self.name = args['model']
self.tokenizer = AutoTokenizer.from_pretrained(
self.name, trust_remote_code=True)
device = args['device']
if device == "auto":
device = get_cuda_if_available()
else:
device = torch.device(device)
logger.info(f"Loading {self.name} on {device}")
self.model = AutoModelForCausalLM.from_pretrained(
self.name, trust_remote_code=True).to(device)

        self.generation_args: dict[str, bool | int | float] = {
"do_sample": args["do_sample"],
"temperature": args['temperature'], # chat default is 0.8
"max_new_tokens": args['max_new_tokens'],
"top_p": args['top_p'], # chat default is 0.8
# "length_penalty": args["length_penalty"],
"num_return_sequences": args['num_return_sequences'],
# "num_beams": self.num_return_sequences,
            # Beam search is not used here because it makes LLM outputs less diverse (only a few distinct tactics get produced).
"output_scores": args["output_scores"],
"output_logits": args["output_logits"],
"return_dict_in_generate": args["return_dict_in_generate"],
}

def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
prompt = input + target_prefix
prompt = pre_process_input(self.name, prompt)

self.model = self.model.eval()

tokenized_input = self.tokenizer(prompt, return_tensors="pt")
eos_token_id = [self.tokenizer.eos_token_id,
self.tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
outputs = self.model.generate(
tokenized_input.input_ids.to(self.device),
eos_token_id=eos_token_id,
**self.generation_args
)
response = self.tokenizer.batch_decode(
outputs['sequences'], skip_special_tokens=True)
result = []
index = 0
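        # NOTE: this zips the i-th decoded sequence with the logits of the i-th
        # generation step, so the resulting value is only a rough confidence
        # heuristic rather than a true per-sequence log-probability.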
for out, score in zip(response, outputs.scores):
out = post_process_output(self.name, out)
result.append((out, score[index].exp().sum().log().cpu().item()))
index += 1
result = choices_dedup(result)
return result


if __name__ == "__main__":

generation_kwargs = {"model": "internlm/internlm2-math-plus-1_8b",
"temperature": 0.6,
"max_new_tokens": 1024,
"top_p": 0.9,
"length_penalty": 0,
"num_return_sequences": 64,
"do_sample": True,
"output_scores": True,
"output_logits": False,
"return_dict_in_generate": True,
"device": "auto",
}
model = HFTacticGenerator(**generation_kwargs)
model.cuda()
print(model.generate("n : ℕ\n⊢ gcd n n = n"))