brainhack-24 (Public archive, generated from TIL-24/til-24-base)
This repository has been archived by the owner on Sep 7, 2024. It is now read-only.

Add new VLM with OWLv2 and Clip
Co-authored-by: neosouwchuan <[email protected]>
Co-authored-by: BrianHuBuyan <[email protected]>
3 people committed Jun 2, 2024
1 parent 337aad6 commit 6d44437
Showing 4 changed files with 81 additions and 110 deletions.
4 changes: 2 additions & 2 deletions vlm/Dockerfile
@@ -19,9 +19,9 @@ WORKDIR /workspace
# install other requirements
COPY requirements.txt .
RUN pip install -r requirements.txt
RUN apt install libgl1-mesa-glx -y

# copy the rest of the files into the container
COPY src .

# start model service
CMD uvicorn api_service:app --port 5004 --host 0.0.0.0
14 changes: 6 additions & 8 deletions vlm/requirements.txt
@@ -1,15 +1,13 @@
fastapi
gdown==4.5.1
matplotlib>=3.2.2
numpy>=1.18.5,<1.24.0
PyYAML>=5.3.1
opencv-python==4.5.5.64
scipy>=1.4.1
opencv-python==4.9.0.80
opencv-python-headless==4.9.0.80
opencv-contrib-python
git+https://github.com/openai/CLIP.git
--extra-index-url https://download.pytorch.org/whl/cu121
torch==2.3.0+cu121
torchvision==0.18.0+cu121
tqdm>=4.41.0
protobuf<4.21.3
transformers==4.37.0
accelerate
uvicorn[standard]
ultralytics==8.2.27
Pillow
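
Because the pins above mix a CUDA 12.1 PyTorch wheel with source installs (the OpenAI CLIP repo) and ultralytics, a quick import check can catch environment mismatches before building the container. This is a minimal sketch, not part of the repo; the checkpoint names mirror VLMManager.py, and `yolov8s-world.pt` is just a stand-in public YOLO-World weight, not the team's fine-tuned file.

# Hypothetical environment sanity check for the pinned stack above (assumes a CUDA 12.1-capable machine).
import torch
import clip
from transformers import Owlv2Processor, Owlv2ForObjectDetection
from ultralytics import YOLOWorld

print(torch.__version__, torch.cuda.is_available())

# CLIP ViT-B/32 from the pinned git install
clip_model, clip_preprocess = clip.load("ViT-B/32", device="cpu")

# OWLv2 base checkpoint from transformers
processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16")

# YOLO-World via ultralytics (downloads a public checkpoint if not cached)
yolo = YOLOWorld("yolov8s-world.pt")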
154 changes: 73 additions & 81 deletions vlm/src/VLMManager.py
@@ -1,92 +1,84 @@
from typing import Dict, List, Optional, Tuple, Union
import io
import json
import os
import random
from os import listdir, path
from os.path import isfile, join

import torch
import clip
from PIL import Image
from torchvision import transforms
from torchvision.transforms.functional import pil_to_tensor
from transformers import Owlv2Processor, Owlv2ForObjectDetection
from transformers.image_transforms import center_to_corners_format
from transformers.utils import TensorType
from ultralytics import YOLOWorld

def post_process_object_detection(
    outputs,
    threshold: float = 0.1,
    target_sizes: Union[TensorType, List[Tuple]] = None,
    n: int = 1,
):
    """
    Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
    bottom_right_x, bottom_right_y) format.

    Args:
        outputs ([`OwlViTObjectDetectionOutput`]):
            Raw outputs of the model.
        threshold (`float`, *optional*):
            Score threshold to keep object detection predictions.
        target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
            Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
            `(height, width)` of each image in the batch. If unset, predictions will not be resized.
        n (`int`, *optional*):
            Batch size used to build the hard-coded scale tensor below.

    Returns:
        `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
        in the batch as predicted by the model.
    """
    # TODO: (amy) add support for other frameworks
    logits, boxes = outputs.logits, outputs.pred_boxes

    # if target_sizes is not None:
    #     if len(logits) != len(target_sizes):
    #         raise ValueError(
    #             "Make sure that you pass in as many target sizes as the batch dimension of the logits"
    #         )

    probs = torch.max(logits, dim=-1)
    scores = torch.sigmoid(probs.values)
    labels = probs.indices

    # Convert to [x0, y0, x1, y1] format
    boxes = center_to_corners_format(boxes)

    # Convert from relative [0, 1] to absolute [0, height] coordinates
    if target_sizes is not None:
        scale_fct = torch.tensor([[1520., 1520., 1520., 1520.] for i in range(n)], device='cuda:0')
        boxes = boxes * scale_fct[:, None, :]
        # print(boxes)

    results = []
    for s, l, b in zip(scores, labels, boxes):
        score = s[s > threshold]
        label = l[s > threshold]
        box = b[s > threshold]
        results.append({"scores": score, "labels": label, "boxes": box})

    return results


class VLMManager:
    def __init__(self):
        # initialize the models here
        self.device = torch.device("cuda")
        # OWLv2 open-vocabulary detector
        self.processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16")
        self.model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16").cuda()
        # CLIP-style preprocessing (resize, center-crop, normalize with CLIP statistics)
        self.clippreprocess = transforms.Compose([
            transforms.Resize(size=224, interpolation=transforms.InterpolationMode("bicubic"), max_size=None, antialias=True),
            transforms.CenterCrop(size=(224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711)),
        ])
        # debug output: working directory and bundled files
        print(os.getcwd())
        print([f for f in os.listdir('.') if os.path.isfile(f)])
        # fine-tuned CLIP model used to re-rank candidate boxes against the caption
        self.clipmodel = torch.load(path.join(path.dirname(path.abspath(__file__)), "clip_ft_2.pt"))
        self.objects = ["cargo aircraft", "light aircraft", "commercial aircraft", "drone", "missile", "helicopter", "fighter jet", "fighter plane"]
        # YOLO-World detector with fine-tuned weights (this overwrites the OWLv2 model assigned above)
        self.model = YOLOWorld(path.join(path.dirname(path.abspath(__file__)), "yoloworldbest2.pt")).to(self.device)
        # inference only: freeze all parameters
        for i in self.clipmodel.parameters():
            i.requires_grad = False
        for i in self.model.parameters():
            i.requires_grad = False

    def identify(self, imagebyte: bytes, caption: str) -> List[int]:
        # perform object detection with a vision-language model

        # --- OWLv2 detection path ---
        inputs = self.processor(text=caption, images=Image.open(io.BytesIO(imagebyte)), return_tensors="pt").to(self.device)
        outputs = self.model(**inputs)
        results = post_process_object_detection(outputs=outputs, target_sizes=1, threshold=0.1, n=1)
        # unpack the single-image result
        scores, labels, boxes = results[0]["scores"], results[0]["labels"], results[0]["boxes"]
        maxconfidence = 0
        bbox = []
        # keep the highest-confidence detection
        for box, confidence, label in zip(boxes, scores, labels):
            box = torch.Tensor.tolist(box)
            if confidence > maxconfidence:
                bbox = box
                maxconfidence = confidence
        if bbox != []:
            # convert from (x1, y1, x2, y2) to (x, y, w, h)
            bbox = [int(i) for i in bbox]
            bbox[2] -= bbox[0]
            bbox[3] -= bbox[1]
            return bbox
        else:
            return [0, 0, 0, 0]
        # --- YOLO-World + CLIP re-ranking path ---
        inputimage = Image.open(io.BytesIO(imagebyte))
        out = self.model.predict(inputimage, conf=0.01)
        # map the caption to one of the known object classes
        groundcat = -1
        for currindex, i in enumerate(self.objects):
            if i in caption:
                groundcat = currindex
        groundbox = [440, 112, 52, 36]
        classlist = out[0].boxes.cls.tolist()
        # collect all detections whose class matches the caption
        possible = []
        for indexx, i in enumerate(classlist):
            if i == groundcat:
                possible.append(indexx)
        bestindex = -1
        bboxlist = out[0].boxes.xyxyn.tolist()
        tokenizedtext = clip.tokenize([caption]).to(self.device)
        clipprob = []
        maxscore = 0
        # re-rank candidate boxes by CLIP similarity between the crop and the caption
        for chosenindex in possible:
            bbox = bboxlist[chosenindex]
            # scale normalized coordinates to the 1520x870 image
            bbox[0] *= 1520
            bbox[1] *= 870
            bbox[2] *= 1520
            bbox[3] *= 870
            # shift the box left and up by half its width and height
            deltax = bbox[2] - bbox[0]
            deltay = bbox[3] - bbox[1]
            bbox[0] -= deltax / 2
            bbox[1] -= deltay / 2
            bbox[2] -= deltax / 2
            bbox[3] -= deltay / 2
            croppedimage = inputimage.crop(bbox)
            croppedimage = self.clippreprocess(croppedimage).unsqueeze(0).to(self.device)
            logits_per_image, logits_per_text = self.clipmodel(croppedimage, tokenizedtext)
            probs = logits_per_image.cpu().detach().numpy()
            if probs[0][0] > maxscore:
                maxscore = probs[0][0]
                bestindex = chosenindex
                bestbbox = bbox.copy()
        try:
            # fall back to a random detection if no class matched the caption
            if bestindex == -1:
                bestbbox = random.choice(bboxlist)
            # convert from (x1, y1, x2, y2) to (x, y, w, h)
            bestbbox[2] -= bestbbox[0]
            bestbbox[3] -= bestbbox[1]
            for i in range(4):
                bestbbox[i] = int(bestbbox[i])
            return bestbbox
        except Exception:
            return [0, 0, 0, 0]
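
The Dockerfile launches `uvicorn api_service:app`, but api_service.py itself is not part of this diff. Below is a minimal sketch of how such a service might wrap VLMManager; the `/identify` route, the base64 `instances` payload, and the response shape are assumptions modeled on the til-24-base template, not code from this commit.

# Hypothetical api_service.py sketch (not in this diff); request/response schema is an assumption.
import base64
from typing import Dict, List

from fastapi import FastAPI
from pydantic import BaseModel

from VLMManager import VLMManager

app = FastAPI()
vlm_manager = VLMManager()


class VLMRequest(BaseModel):
    # each instance: {"b64": <base64-encoded image>, "caption": <target description>}
    instances: List[Dict[str, str]]


@app.get("/health")
def health() -> Dict[str, str]:
    return {"message": "health ok"}


@app.post("/identify")
def identify(request: VLMRequest) -> Dict[str, List[List[int]]]:
    predictions = []
    for instance in request.instances:
        image_bytes = base64.b64decode(instance["b64"])
        # VLMManager.identify returns [x, y, w, h]
        predictions.append(vlm_manager.identify(image_bytes, instance["caption"]))
    return {"predictions": predictions}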
19 changes: 0 additions & 19 deletions vlm/src/config.json

This file was deleted.
