import base64

import cv2
import numpy as np
from openai import OpenAI

from neus_v.calibration.vlm import smooth_mapping
from neus_v.vlm.obj import DetectedObject
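
# Dependencies (assumed, not pinned in the original source): `openai`,
# `opencv-python`, and `numpy` come from PyPI; `neus_v` is the project's own
# package and must be importable for smooth_mapping and DetectedObject.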


class VLLMClient:
    """Thin wrapper around an OpenAI-compatible vLLM endpoint for Yes/No queries over image sequences."""

    def __init__(
        self,
        api_key="EMPTY",
        api_base="http://localhost:8000/v1",
        model="OpenGVLab/InternVL2_5-8B",
    ):
        self.client = OpenAI(api_key=api_key, base_url=api_base)
        self.model = model
        self.system_message = (
            "You must only return a Yes or No, and not both, to any question asked.\n"
            "You must not include any other symbols, information, text, justification in your answer or repeat Yes or No multiple times.\n"
            "For example, if the question is 'Is there a cat present in the Image?', the answer must only be 'Yes' or 'No'."
        )

    def _encode_frame(self, frame):
        """Encode a uint8 numpy image array as a JPEG, then base64-encode the bytes."""
        ret, buffer = cv2.imencode(".jpg", frame)
        if not ret:
            raise ValueError("Could not encode frame")
        return base64.b64encode(buffer).decode("utf-8")

    def detect(self, seq_of_frames, scene_description, threshold=None, **kwargs):
        """Ask the VLM whether a sequence of frames depicts the given scene description."""
        # Encode each frame as a base64 JPEG.
        encoded_images = [self._encode_frame(frame) for frame in seq_of_frames]
        # Build the user message: a text prompt plus one image part per frame.
        user_content = [
            {
                "type": "text",
                "text": f"Does the sequence of these images depict '{scene_description}'?",
            }
        ]
        for encoded in encoded_images:
            user_content.append(
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
                }
            )
        # Request a single token ("Yes" or "No") at temperature 0, and ask the
        # server for the top-20 log-probabilities of that token so the Yes/No
        # probability mass can be recovered below.
        chat_response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": self.system_message},
                {"role": "user", "content": user_content},
            ],
            max_tokens=1,
            temperature=0.0,
            logprobs=True,
            top_logprobs=20,
        )
        # Retrieve the list of TopLogprob objects for the single generated token.
        top_logprobs_list = chat_response.choices[0].logprobs.content[0].top_logprobs
        # Build a mapping from token text (stripped) to its probability.
        token_prob_map = {}
        for top_logprob in top_logprobs_list:
            token_text = top_logprob.token.strip()
            token_prob_map[token_text] = np.exp(top_logprob.logprob)
        # Extract probabilities for "Yes" and "No".
        yes_prob = token_prob_map.get("Yes", 0.0)
        no_prob = token_prob_map.get("No", 0.0)
        # Compute the normalized probability for "Yes": p_yes / (p_yes + p_no).
        if yes_prob + no_prob > 0:
            confidence = yes_prob / (yes_prob + no_prob)
        else:
            raise ValueError("No probabilities for 'Yes' or 'No' found in the response.")
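        # Worked example (illustrative numbers, not from a real response): if
        # p("Yes") = 0.60 and p("No") = 0.20, the normalized confidence is
        # 0.60 / (0.60 + 0.20) = 0.75; the remaining 0.20 of probability mass
        # on other tokens is ignored by this normalization.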
        if threshold is not None:
            # Remap the raw confidence through the calibration curve, then force
            # a negative detection when it still falls below the threshold.
            confidence = smooth_mapping(confidence=confidence, false_threshold=threshold)
            if confidence < threshold:
                no_prob = 1.0
        return DetectedObject(
            name=scene_description,
            model_name=self.model,
            confidence=round(confidence, 3),
            probability=round(confidence, 3),
            number_of_detection=1,
            # The no_prob override above forces this to False below the threshold.
            is_detected=yes_prob > no_prob,
        )
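

# --- Usage sketch (assumption, not part of the original file): drives the client
# end to end. It assumes a vLLM/OpenAI-compatible server is already serving
# OpenGVLab/InternVL2_5-8B at http://localhost:8000/v1, and "video.mp4" is a
# hypothetical local file standing in for real input frames.
if __name__ == "__main__":
    client = VLLMClient()
    cap = cv2.VideoCapture("video.mp4")  # hypothetical input video
    frames = []
    while len(frames) < 4:  # sample the first few frames for a cheap query
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    result = client.detect(
        seq_of_frames=frames,
        scene_description="a person walking through a doorway",
        threshold=0.5,
    )
    print(result)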