Commit

[CLEANUP]
Kye committed Mar 18, 2024
1 parent 8c14701 commit 66c65b0
Showing 11 changed files with 41 additions and 502 deletions.
22 changes: 5 additions & 17 deletions README.md
@@ -1,17 +1,15 @@
[![Multi-Modality](agorabanner.png)](https://discord.gg/qUtxnK2NMf)

# Swarms Cloud
- Swarms-as-a-service
- 100% uptime
- Bleeding-Edge Performance
- Production-Grade Reliability

Infrastructure for scalable, reliable, and economical multi-modal model API serving and deployment. We use Terraform to orchestrate the infrastructure and FastAPI to host the models. If you're into deploying models for millions of people, join our Discord and help contribute.
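
Below is a minimal sketch of this serving pattern, assuming an OpenAI-style `/v1/chat/completions` endpoint like the one the servers in this repo expose. The `Message`, `ChatRequest`, and `ChatResponse` models are simplified stand-ins for the schemas in `swarms_cloud/schema/openai_protocol.py`, and the handler body is a placeholder rather than real model inference.

```python
# Minimal sketch of the FastAPI serving pattern (not the actual server code).
from typing import List

import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class Message(BaseModel):
    role: str
    content: str


class ChatRequest(BaseModel):
    # Simplified stand-in for the OpenAI-compatible request schema
    model: str
    messages: List[Message]


class ChatResponse(BaseModel):
    # Simplified stand-in for the OpenAI-compatible response schema
    model: str
    choices: List[dict]


@app.post("/v1/chat/completions")
async def chat_completions(request: ChatRequest) -> ChatResponse:
    # Placeholder: a real server would run model inference here.
    reply = {"index": 0, "message": {"role": "assistant", "content": "..."}}
    return ChatResponse(model=request.model, choices=[reply])


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
```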


# Install
`pip install swarms-cloud`

# Models
# Examples

## Example API Request in Python
### Example Python
```python
import requests
import base64
@@ -262,15 +260,5 @@ print(out)
- Terraform


# Example

`scripts/send_request_to_cogvlm.py` (the payload shown here is illustrative):
```python
import requests

# request_data (assumed): an OpenAI-style chat completion payload
request_data = {"model": "cogvlm-chat-17b", "messages": [{"role": "user", "content": "Hello"}]}
url = "https://api.swarms.world/v1/chat/completions"
response = requests.post(url, json=request_data)
print(response.text)
```
# License
MIT
2 changes: 1 addition & 1 deletion servers/llms/llm.py
@@ -17,7 +17,7 @@
)
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.serve.async_engine import AsyncEngine
from swarms_cloud.openai_protocol import ( # noqa: E501
from swarms_cloud.schema.openai_protocol import ( # noqa: E501
ChatCompletionRequest,
ChatCompletionRequestQos,
ChatCompletionResponse,
43 changes: 22 additions & 21 deletions servers/qwen/qwen_api_openai_compatible.py
@@ -1,7 +1,6 @@
import base64
import re
import gc
import json
import copy
import os
import time
@@ -36,7 +35,6 @@
QUANT_ENABLED = os.environ.get("QUANT_ENABLED", True)



if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
torch_type = torch.bfloat16
else:
@@ -45,7 +43,12 @@
print(f"========Use torch type as:{torch_type} with device:{DEVICE}========\n\n")

# Model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True, load_in_4bit=True, torch_dtype=torch_type)
tokenizer = AutoTokenizer.from_pretrained(
"Qwen/Qwen-VL-Chat",
trust_remote_code=True,
load_in_4bit=True,
torch_dtype=torch_type,
)

model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
@@ -173,7 +176,6 @@ class ChatCompletionResponseChoice(BaseModel):
message: ChatMessageResponse



class ChatCompletionResponseStreamChoice(BaseModel):
index: int
delta: DeltaMessage
@@ -216,6 +218,7 @@ class ChatCompletionResponse(BaseModel):

_TEXT_COMPLETION_CMD = object()


@app.get("/v1/models", response_model=ModelList)
async def list_models():
"""
@@ -389,7 +392,7 @@ def parse_messages(messages):
if all(m.role != "user" for m in messages):
raise HTTPException(
status_code=400,
detail=f"Invalid request: Expecting at least one user message.",
detail="Invalid request: Expecting at least one user message.",
)

messages = copy.deepcopy(messages)
@@ -415,7 +418,7 @@ def parse_messages(messages):
if (len(messages) == 0) or (messages[-1].role != "assistant"):
raise HTTPException(
status_code=400,
detail=f"Invalid request: Expecting role assistant before role function.",
detail="Invalid request: Expecting role assistant before role function.",
)
messages[-1].content += f"\nObservation: {content}"
if m_idx == len(_messages) - 1:
@@ -424,13 +427,15 @@ def parse_messages(messages):
if len(messages) == 0:
raise HTTPException(
status_code=400,
detail=f"Invalid request: Expecting role user before role assistant.",
detail="Invalid request: Expecting role user before role assistant.",
)
last_msg = messages[-1].content
last_msg_has_zh = len(re.findall(r"[\u4e00-\u9fff]+", last_msg)) > 0
len(re.findall(r"[\u4e00-\u9fff]+", last_msg)) > 0
if messages[-1].role == "user":
messages.append(
ChatMessageInput(role="assistant", content=content.lstrip("\n").rstrip())
ChatMessageInput(
role="assistant", content=content.lstrip("\n").rstrip()
)
)
else:
messages[-1].content += content
@@ -475,7 +480,6 @@ def parse_messages(messages):
return query, history



def process_history_and_images(
messages: List[ChatMessageInput],
) -> Tuple[Optional[str], Optional[List[Tuple[str, str]]], Optional[List[Image.Image]]]:
@@ -545,7 +549,6 @@ def generate_stream_cogvlm(
Generates a stream of responses using the CogVLM model in inference mode.
It's optimized to handle continuous input-output interactions with the model in a streaming manner.
"""


messages = params["messages"]
temperature = float(params.get("temperature", 1.0))
@@ -555,12 +558,14 @@
query, history, image_list = process_history_and_images(messages)
logger.debug(f"==== request ====\n{query}")
# Save the image temporarily
temp_image_path = 'temp_image.jpg' # Define a temporary file path
temp_image_path = "temp_image.jpg" # Define a temporary file path
image_list[0].save(temp_image_path) # Assuming image_list[0] is a PIL Image object
inputs = tokenizer.from_list_format([
{"text": query},
{"image": temp_image_path},
])
inputs = tokenizer.from_list_format(
[
{"text": query},
{"image": temp_image_path},
]
)

streamer = TextIteratorStreamer(
tokenizer=tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
@@ -575,12 +580,8 @@
if temperature > 1e-5:
gen_kwargs["temperature"] = temperature

total_len = 0
generated_text = ""
response = model.chat(tokenizer, query=inputs, history=None)
ret = {
"text": item[0] for item in response
}
ret = {"text": item[0] for item in response}
yield ret


19 changes: 10 additions & 9 deletions servers/qwen/qwenvl_api.py
@@ -16,7 +16,6 @@
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from sse_starlette.sse import EventSourceResponse
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.generation import GenerationConfig

@@ -160,7 +159,7 @@ def parse_messages(messages, functions):
if all(m.role != "user" for m in messages):
raise HTTPException(
status_code=400,
detail=f"Invalid request: Expecting at least one user message.",
detail="Invalid request: Expecting at least one user message.",
)

messages = copy.deepcopy(messages)
@@ -214,7 +213,7 @@ def parse_messages(messages, functions):
if (len(messages) == 0) or (messages[-1].role != "assistant"):
raise HTTPException(
status_code=400,
detail=f"Invalid request: Expecting role assistant before role function.",
detail="Invalid request: Expecting role assistant before role function.",
)
messages[-1].content += f"\nObservation: {content}"
if m_idx == len(_messages) - 1:
@@ -223,7 +222,7 @@ def parse_messages(messages, functions):
if len(messages) == 0:
raise HTTPException(
status_code=400,
detail=f"Invalid request: Expecting role user before role assistant.",
detail="Invalid request: Expecting role user before role assistant.",
)
last_msg = messages[-1].content
last_msg_has_zh = len(re.findall(r"[\u4e00-\u9fff]+", last_msg)) > 0
@@ -368,7 +367,9 @@ async def create_chat_completion(request: ChatCompletionRequest):
)
# generate = predict(query, history, request.model, stop_words)
# return EventSourceResponse(generate, media_type="text/event-stream")
raise HTTPException(status_code=400, detail="Stream request is not supported currently.")
raise HTTPException(
status_code=400, detail="Stream request is not supported currently."
)

stop_words_ids = [tokenizer.encode(s) for s in stop_words] if stop_words else None
if query is _TEXT_COMPLETION_CMD:
@@ -408,7 +409,7 @@ async def predict(
chunk = ChatCompletionResponse(
model=model_id, choices=[choice_data], object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield f"{chunk.model_dump_json(exclude_unset=True)}"

current_length = 0
stop_words_ids = [tokenizer.encode(s) for s in stop_words] if stop_words else None
@@ -434,15 +435,15 @@
chunk = ChatCompletionResponse(
model=model_id, choices=[choice_data], object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield f"{chunk.model_dump_json(exclude_unset=True)}"

choice_data = ChatCompletionResponseStreamChoice(
index=0, delta=DeltaMessage(), finish_reason="stop"
)
chunk = ChatCompletionResponse(
model=model_id, choices=[choice_data], object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield f"{chunk.model_dump_json(exclude_unset=True)}"
yield "[DONE]"


@@ -500,4 +501,4 @@ def _get_args():
resume_download=True,
)

uvicorn.run(app, host=args.server_name, port=args.server_port, workers=1)
uvicorn.run(app, host=args.server_name, port=args.server_port, workers=1)
4 changes: 2 additions & 2 deletions swarms_cloud/__init__.py
@@ -2,7 +2,7 @@
from swarms_cloud.func_api_wrapper import SwarmCloud
from swarms_cloud.rate_limiter import rate_limiter
from swarms_cloud.sky_api import SkyInterface
from swarms_cloud.openai_protocol import ( # noqa: E501
from swarms_cloud.schema.openai_protocol import ( # noqa: E501
ChatCompletionRequest,
ChatCompletionRequestQos,
ChatCompletionResponse,
@@ -38,7 +38,7 @@
check_request,
)

from swarms_cloud.openai_spec import (
from swarms_cloud.schema.openai_spec import (
InputOpenAISpec,
OutputOpenAISpec,
OpenAIAPIWrapper,
2 changes: 1 addition & 1 deletion swarms_cloud/api_utils.py
@@ -5,7 +5,7 @@
from fastapi.responses import JSONResponse
from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer

from swarms_cloud.openai_protocol import ( # noqa: E501
from swarms_cloud.schema.openai_protocol import ( # noqa: E501
ErrorResponse,
)

Empty file added swarms_cloud/schema/__init__.py
Empty file.
File renamed without changes.
File renamed without changes.