Commit

[CLEANUP]
Kye committed Mar 18, 2024
1 parent 8c14701 commit 66c65b0
Showing 11 changed files with 41 additions and 502 deletions.
22 changes: 5 additions & 17 deletions README.md
@@ -1,17 +1,15 @@
[![Multi-Modality](agorabanner.png)](https://discord.gg/qUtxnK2NMf)

# Swarms Cloud
- Swarms-as-a-service
- 100% uptime
- Bleeding-Edge Performance
- Production-Grade Reliability

Infrastructure for scalable, reliable, and economical multi-modal model API serving and deployment. We use Terraform to orchestrate the infrastructure and FastAPI to host the models. If you're into deploying models for millions of people, join our Discord and help contribute.
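
Below is a minimal sketch of this serving pattern, assuming an OpenAI-style `/v1/chat/completions` endpoint like the one the servers in this repo expose. The `Message`, `ChatRequest`, and `ChatResponse` models are simplified stand-ins for the schemas in `swarms_cloud/schema/openai_protocol.py`, and the handler body is a placeholder rather than real model inference.

```python
# Minimal sketch of the FastAPI serving pattern (not the actual server code).
from typing import List

import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class Message(BaseModel):
    role: str
    content: str


class ChatRequest(BaseModel):
    # Simplified stand-in for the OpenAI-compatible request schema
    model: str
    messages: List[Message]


class ChatResponse(BaseModel):
    # Simplified stand-in for the OpenAI-compatible response schema
    model: str
    choices: List[dict]


@app.post("/v1/chat/completions")
async def chat_completions(request: ChatRequest) -> ChatResponse:
    # Placeholder: a real server would run model inference here.
    reply = {"index": 0, "message": {"role": "assistant", "content": "..."}}
    return ChatResponse(model=request.model, choices=[reply])


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
```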


# Install
`pip install swarms-cloud`

# Models
# Examples

## Example API Request in Python
### Example Python
```python
import requests
import base64
@@ -262,15 +260,5 @@ print(out)
- Terraform


# Example

`scripts/send_request_to_cogvlm.py` (the payload shown here is illustrative):
```python
import requests

# request_data (assumed): an OpenAI-style chat completion payload
request_data = {"model": "cogvlm-chat-17b", "messages": [{"role": "user", "content": "Hello"}]}
url = "https://api.swarms.world/v1/chat/completions"
response = requests.post(url, json=request_data)
print(response.text)
```
# License
MIT
2 changes: 1 addition & 1 deletion servers/llms/llm.py
@@ -17,7 +17,7 @@
)
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.serve.async_engine import AsyncEngine
from swarms_cloud.openai_protocol import ( # noqa: E501
from swarms_cloud.schema.openai_protocol import ( # noqa: E501
ChatCompletionRequest,
ChatCompletionRequestQos,
ChatCompletionResponse,
43 changes: 22 additions & 21 deletions servers/qwen/qwen_api_openai_compatible.py
@@ -1,7 +1,6 @@
import base64
import re
import gc
import json
import copy
import os
import time
@@ -36,7 +35,6 @@
QUANT_ENABLED = os.environ.get("QUANT_ENABLED", True)



if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
torch_type = torch.bfloat16
else:
@@ -45,7 +43,12 @@
print(f"========Use torch type as:{torch_type} with device:{DEVICE}========\n\n")

# Model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True, load_in_4bit=True, torch_dtype=torch_type)
tokenizer = AutoTokenizer.from_pretrained(
"Qwen/Qwen-VL-Chat",
trust_remote_code=True,
load_in_4bit=True,
torch_dtype=torch_type,
)

model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
@@ -173,7 +176,6 @@ class ChatCompletionResponseChoice(BaseModel):
message: ChatMessageResponse



class ChatCompletionResponseStreamChoice(BaseModel):
index: int
delta: DeltaMessage
@@ -216,6 +218,7 @@ class ChatCompletionResponse(BaseModel):

_TEXT_COMPLETION_CMD = object()


@app.get("/v1/models", response_model=ModelList)
async def list_models():
"""
@@ -389,7 +392,7 @@ def parse_messages(messages):
if all(m.role != "user" for m in messages):
raise HTTPException(
status_code=400,
detail=f"Invalid request: Expecting at least one user message.",
detail="Invalid request: Expecting at least one user message.",
)

messages = copy.deepcopy(messages)
@@ -415,7 +418,7 @@ def parse_messages(messages):
if (len(messages) == 0) or (messages[-1].role != "assistant"):
raise HTTPException(
status_code=400,
detail=f"Invalid request: Expecting role assistant before role function.",
detail="Invalid request: Expecting role assistant before role function.",
)
messages[-1].content += f"\nObservation: {content}"
if m_idx == len(_messages) - 1:
@@ -424,13 +427,15 @@ def parse_messages(messages):
if len(messages) == 0:
raise HTTPException(
status_code=400,
detail=f"Invalid request: Expecting role user before role assistant.",
detail="Invalid request: Expecting role user before role assistant.",
)
last_msg = messages[-1].content
last_msg_has_zh = len(re.findall(r"[\u4e00-\u9fff]+", last_msg)) > 0
len(re.findall(r"[\u4e00-\u9fff]+", last_msg)) > 0
if messages[-1].role == "user":
messages.append(
ChatMessageInput(role="assistant", content=content.lstrip("\n").rstrip())
ChatMessageInput(
role="assistant", content=content.lstrip("\n").rstrip()
)
)
else:
messages[-1].content += content
@@ -475,7 +480,6 @@ def parse_messages(messages):
return query, history



def process_history_and_images(
messages: List[ChatMessageInput],
) -> Tuple[Optional[str], Optional[List[Tuple[str, str]]], Optional[List[Image.Image]]]:
@@ -545,7 +549,6 @@ def generate_stream_cogvlm(
Generates a stream of responses using the CogVLM model in inference mode.
It's optimized to handle continuous input-output interactions with the model in a streaming manner.
"""


messages = params["messages"]
temperature = float(params.get("temperature", 1.0))
@@ -555,12 +558,14 @@
query, history, image_list = process_history_and_images(messages)
logger.debug(f"==== request ====\n{query}")
# Save the image temporarily
temp_image_path = 'temp_image.jpg' # Define a temporary file path
temp_image_path = "temp_image.jpg" # Define a temporary file path
image_list[0].save(temp_image_path) # Assuming image_list[0] is a PIL Image object
inputs = tokenizer.from_list_format([
{"text": query},
{"image": temp_image_path},
])
inputs = tokenizer.from_list_format(
[
{"text": query},
{"image": temp_image_path},
]
)

streamer = TextIteratorStreamer(
tokenizer=tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
@@ -575,12 +580,8 @@
if temperature > 1e-5:
gen_kwargs["temperature"] = temperature

total_len = 0
generated_text = ""
response = model.chat(tokenizer, query=inputs, history=None)
ret = {
"text": item[0] for item in response
}
ret = {"text": item[0] for item in response}
yield ret


19 changes: 10 additions & 9 deletions servers/qwen/qwenvl_api.py
@@ -16,7 +16,6 @@
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from sse_starlette.sse import EventSourceResponse
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.generation import GenerationConfig

@@ -160,7 +159,7 @@ def parse_messages(messages, functions):
if all(m.role != "user" for m in messages):
raise HTTPException(
status_code=400,
detail=f"Invalid request: Expecting at least one user message.",
detail="Invalid request: Expecting at least one user message.",
)

messages = copy.deepcopy(messages)
@@ -214,7 +213,7 @@ def parse_messages(messages, functions):
if (len(messages) == 0) or (messages[-1].role != "assistant"):
raise HTTPException(
status_code=400,
detail=f"Invalid request: Expecting role assistant before role function.",
detail="Invalid request: Expecting role assistant before role function.",
)
messages[-1].content += f"\nObservation: {content}"
if m_idx == len(_messages) - 1:
@@ -223,7 +222,7 @@ def parse_messages(messages, functions):
if len(messages) == 0:
raise HTTPException(
status_code=400,
detail=f"Invalid request: Expecting role user before role assistant.",
detail="Invalid request: Expecting role user before role assistant.",
)
last_msg = messages[-1].content
last_msg_has_zh = len(re.findall(r"[\u4e00-\u9fff]+", last_msg)) > 0
@@ -368,7 +367,9 @@ async def create_chat_completion(request: ChatCompletionRequest):
)
# generate = predict(query, history, request.model, stop_words)
# return EventSourceResponse(generate, media_type="text/event-stream")
raise HTTPException(status_code=400, detail="Stream request is not supported currently.")
raise HTTPException(
status_code=400, detail="Stream request is not supported currently."
)

stop_words_ids = [tokenizer.encode(s) for s in stop_words] if stop_words else None
if query is _TEXT_COMPLETION_CMD:
@@ -408,7 +409,7 @@ async def predict(
chunk = ChatCompletionResponse(
model=model_id, choices=[choice_data], object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield f"{chunk.model_dump_json(exclude_unset=True)}"

current_length = 0
stop_words_ids = [tokenizer.encode(s) for s in stop_words] if stop_words else None
@@ -434,15 +435,15 @@
chunk = ChatCompletionResponse(
model=model_id, choices=[choice_data], object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield f"{chunk.model_dump_json(exclude_unset=True)}"

choice_data = ChatCompletionResponseStreamChoice(
index=0, delta=DeltaMessage(), finish_reason="stop"
)
chunk = ChatCompletionResponse(
model=model_id, choices=[choice_data], object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield f"{chunk.model_dump_json(exclude_unset=True)}"
yield "[DONE]"


@@ -500,4 +501,4 @@ def _get_args():
resume_download=True,
)

uvicorn.run(app, host=args.server_name, port=args.server_port, workers=1)
uvicorn.run(app, host=args.server_name, port=args.server_port, workers=1)
4 changes: 2 additions & 2 deletions swarms_cloud/__init__.py
@@ -2,7 +2,7 @@
from swarms_cloud.func_api_wrapper import SwarmCloud
from swarms_cloud.rate_limiter import rate_limiter
from swarms_cloud.sky_api import SkyInterface
from swarms_cloud.openai_protocol import ( # noqa: E501
from swarms_cloud.schema.openai_protocol import ( # noqa: E501
ChatCompletionRequest,
ChatCompletionRequestQos,
ChatCompletionResponse,
@@ -38,7 +38,7 @@
check_request,
)

from swarms_cloud.openai_spec import (
from swarms_cloud.schema.openai_spec import (
InputOpenAISpec,
OutputOpenAISpec,
OpenAIAPIWrapper,
2 changes: 1 addition & 1 deletion swarms_cloud/api_utils.py
@@ -5,7 +5,7 @@
from fastapi.responses import JSONResponse
from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer

from swarms_cloud.openai_protocol import ( # noqa: E501
from swarms_cloud.schema.openai_protocol import ( # noqa: E501
ErrorResponse,
)

Empty file added swarms_cloud/schema/__init__.py
Empty file.
File renamed without changes.
File renamed without changes.