Update app.py
app.py CHANGED
@@ -4,14 +4,11 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import cv2
 import numpy as np
-import io
 
 
 # # Ensure GPU usage if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-
-
 # Initialize the model and tokenizer
 model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
                                              torch_dtype=torch.float16,
@@ -20,59 +17,54 @@ model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
 tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
 
 
-
-# """Extracts frames from the video, 1 per second."""
-# video = cv2.VideoCapture(io.BytesIO(video_bytes))
-# fps = video.get(cv2.CAP_PROP_FPS)
-# frames = []
-# success, frame = video.read()
-# while success:
-# frames.append(frame)
-# for _ in range(int(fps)): # Skip fps frames
-# success, frame = video.read()
-# video.release()
-# return frames[:4] # Return the first 4 frames
-
-def video_to_frames(video_path):
+def video_to_frames(video, fps=1):
     """Converts a video file into frames and stores them as PNG images in a list."""
-    # List to hold frames encoded as PNG
     frames_png = []
-
-    # Open the video file
-    cap = cv2.VideoCapture(video_path)
-
-    # Check if video opened successfully
+    cap = cv2.VideoCapture(video)
+
     if not cap.isOpened():
         print("Error opening video file")
         return frames_png
 
-
+    frame_count = 0
+    frame_interval = int(cap.get(cv2.CAP_PROP_FPS)) // fps  # Calculate frame interval
+
     while cap.isOpened():
-        # Capture frame-by-frame
         ret, frame = cap.read()
-
-        # If frame is read correctly ret is True
        if not ret:
             print("Can't receive frame (stream end?). Exiting ...")
             break
 
-
-
-
-
+        if frame_count % frame_interval == 0:
+            is_success, buffer = cv2.imencode(".png", frame)
+            if is_success:
+                frames_png.append(np.array(buffer).tobytes())
+
+        frame_count += 1
 
-    # When everything done, release the video capture object
     cap.release()
-
     return frames_png
 
+def extract_frames(frame):
+
+    # Convert binary data to a numpy array
+    frame_np = np.frombuffer(frame, dtype=np.uint8)
+
+    # Decode the PNG image
+    image_rgb = cv2.imdecode(frame_np, flags=cv2.IMREAD_COLOR)  # Assuming it's in RGB format
+
+    # Convert RGB to BGR
+    image_bgr = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)
+
+    return image_bgr
+
 def predict_answer(image, video, question, max_tokens=100):
 
     text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
     input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
 
 
-    if image:
+    if image is not None:
         # Process as an image
         image = image.convert("RGB")
         image_tensor = model.image_preprocess(image)
@@ -86,13 +78,13 @@ def predict_answer(image, video, question, max_tokens=100):
 
         return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
 
-    elif video:
+    elif video is not None:
         # Process as a video
         frames = video_to_frames(video)
         answers = []
         for frame in frames:
-
-            image_tensor = model.image_preprocess(
+            image = extract_frames(frame)
+            image_tensor = model.image_preprocess(image)
 
             # Generate the answer
             output_ids = model.generate(
@@ -114,21 +106,17 @@ def predict_answer(image, video, question, max_tokens=100):
 def gradio_predict(image, video, question, max_tokens):
     answer = predict_answer(image, video, question, max_tokens)
     return answer
-
-
 
-# Define the Gradio interface
 iface = gr.Interface(
     fn=gradio_predict,
-    inputs=[
-
-
-
+    inputs=[
+        gr.Image(type="pil", label="Upload or Drag an Image"),
+        gr.Video(label="Upload your video here"),
+    ],
     outputs=gr.TextArea(label="Answer"),
-    #
-    title="
-
+    # outputs=gr.Image(label="Output"),
+    title="Video/Image Viewer",
+    description="Upload an image or video to view it or extract frames from the video.",
 )
 
-
-iface.queue().launch(debug=True)
+iface.launch(debug=True)
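A note on the interface wiring: gradio_predict takes four parameters (image, video, question, max_tokens), while the committed inputs list contains only the image and video components. If the question and token limit are also meant to come from the UI, the input list could be extended along these lines. This is a sketch only, with a stub in place of the real prediction function; the Textbox and Slider components are assumptions, not part of the commit.

import gradio as gr

def gradio_predict(image, video, question, max_tokens):
    # Stub standing in for the gradio_predict defined in app.py above.
    return "stub answer"

iface = gr.Interface(
    fn=gradio_predict,
    inputs=[
        gr.Image(type="pil", label="Upload or Drag an Image"),
        gr.Video(label="Upload your video here"),
        gr.Textbox(label="Question"),                                        # assumed extra input
        gr.Slider(minimum=10, maximum=500, value=100, label="Max tokens"),   # assumed extra input
    ],
    outputs=gr.TextArea(label="Answer"),
    title="Video/Image Viewer",
    description="Upload an image or video to view it or extract frames from the video.",
)
iface.launch()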