# NeuS-V / neus_v/vlm/vllm_client.py
# Syzygianinfern0's picture
# Update everything from iccv submission
# 8a7a9dd
import base64
import cv2
import numpy as np
from openai import OpenAI
from neus_v.calibration.vlm import smooth_mapping
from neus_v.vlm.obj import DetectedObject
class VLLMClient:
    """Ask a vision-language model a Yes/No question about a frame sequence.

    The model is reached through the ``openai`` client pointed at
    ``api_base`` (the default targets a server on ``localhost:8000``). It is
    instructed to answer with a single ``Yes`` or ``No`` token, and the
    log-probabilities of that token's candidates are turned into a
    confidence score.
    """

    def __init__(
        self,
        api_key: str = "EMPTY",
        api_base: str = "http://localhost:8000/v1",
        model: str = "OpenGVLab/InternVL2_5-8B",
    ):
        """Create the API client and store the prompt configuration.

        Args:
            api_key: Key passed to the ``OpenAI`` client.
            api_base: Base URL passed to the ``OpenAI`` client.
            model: Model identifier sent with every completion request.
        """
        self.client = OpenAI(api_key=api_key, base_url=api_base)
        self.model = model
        self.system_message = (
            "You must only return a Yes or No, and not both, to any question asked.\n"
            "You must not include any other symbols, information, text, justification in your answer or repeat Yes or No multiple times.\n"
            "For example, if the question is 'Is there a cat present in the Image?', the answer must only be 'Yes' or 'No'."
        )

    def _encode_frame(self, frame) -> str:
        """Return ``frame`` as a base64 string of its JPEG encoding.

        Args:
            frame: Image array accepted by ``cv2.imencode`` (a uint8 numpy
                array).

        Raises:
            ValueError: If OpenCV reports that the JPEG encoding failed.
        """
        ok, buffer = cv2.imencode(".jpg", frame)
        if not ok:
            raise ValueError("Could not encode frame")
        return base64.b64encode(buffer).decode("utf-8")

    def detect(self, seq_of_frames, scene_description, threshold=None, **kwargs):
        """Score how likely the frames depict ``scene_description``.

        Args:
            seq_of_frames: Iterable of image arrays; each one is sent to the
                model as a separate JPEG data URL.
            scene_description: Text inserted into the Yes/No question.
            threshold: Optional false-threshold. When truthy, the confidence
                is passed through ``smooth_mapping`` and a result whose
                mapped confidence falls below the threshold is reported as
                not detected.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            A ``DetectedObject`` whose ``confidence`` and ``probability`` are
            both the (optionally mapped) Yes-probability rounded to three
            decimals.

        Raises:
            ValueError: If a frame cannot be encoded, if the response carries
                no token log-probabilities, or if neither ``Yes`` nor ``No``
                appears among the returned candidate tokens.
        """
        # The user message is one text question followed by one image part
        # per frame.
        user_content = [
            {
                "type": "text",
                "text": f"Does the sequence of these images depict '{scene_description}'",
            }
        ]
        user_content.extend(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{self._encode_frame(frame)}"},
            }
            for frame in seq_of_frames
        )

        # A single generated token is enough: only the distribution over the
        # first token (its top 20 candidates) is inspected.
        chat_response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": self.system_message},
                {"role": "user", "content": user_content},
            ],
            max_tokens=1,
            temperature=0.0,
            logprobs=True,
            top_logprobs=20,
        )

        logprobs = chat_response.choices[0].logprobs
        if logprobs is None or not logprobs.content:
            raise ValueError("The response did not include token log-probabilities.")

        # Map stripped token text to probability. Variants that differ only
        # in surrounding whitespace (e.g. "Yes" and " Yes") are summed so a
        # lower-ranked variant cannot overwrite a higher-ranked one.
        token_prob_map = {}
        for candidate in logprobs.content[0].top_logprobs:
            token_text = candidate.token.strip()
            token_prob_map[token_text] = token_prob_map.get(token_text, 0.0) + np.exp(candidate.logprob)

        yes_prob = token_prob_map.get("Yes", 0.0)
        no_prob = token_prob_map.get("No", 0.0)

        # Normalise over the two admissible answers: p_yes / (p_yes + p_no).
        total = yes_prob + no_prob
        if not total > 0:
            raise ValueError("No probabilities for 'Yes' or 'No' found in the response.")
        confidence = yes_prob / total

        is_detected = yes_prob > no_prob  # TODO: Check if this is correct
        if threshold:
            confidence = smooth_mapping(confidence=confidence, false_threshold=threshold)
            # NOTE(review): the threshold is compared against the mapped
            # confidence, not the raw one — confirm this is intended.
            if confidence < threshold:
                is_detected = False

        score = round(confidence, 3)
        return DetectedObject(
            name=scene_description,
            model_name=self.model,
            confidence=score,
            probability=score,
            number_of_detection=1,
            is_detected=is_detected,
        )