prithivMLmods committed (verified)
Commit 45691d2 · Parent(s): b1936b2

Update app.py

Files changed (1): app.py (+90 -55)
app.py CHANGED
@@ -62,10 +62,6 @@ model_k = Qwen2VLForConditionalGeneration.from_pretrained(
 ).to(device).eval()
 
 def downsample_video(video_path):
-    """
-    Downsamples the video to evenly spaced frames.
-    Each frame is returned as a PIL image along with its timestamp.
-    """
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
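Note: only the first three lines of downsample_video appear in this hunk. A minimal sketch of what such a helper typically looks like in these Spaces apps, assuming ten evenly spaced frames and second-resolution timestamps (everything below the fps line is an assumption, not shown in this diff):

import cv2
import numpy as np
from PIL import Image

def downsample_video(video_path, num_frames=10):  # num_frames=10 is an assumed default
    # Sample evenly spaced frames and return (PIL image, timestamp in seconds) pairs.
    vidcap = cv2.VideoCapture(video_path)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    frames = []
    for i in np.linspace(0, total_frames - 1, num_frames, dtype=int):
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
        success, image = vidcap.read()
        if success:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV decodes to BGR
            frames.append((Image.fromarray(image), round(i / fps, 2)))
    vidcap.release()
    return frames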
@@ -84,16 +80,14 @@ def downsample_video(video_path):
 
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
-                   max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
-                   top_k: int = 50,
-                   repetition_penalty: float = 1.2):
-    """
-    Generates responses using the selected model for image input.
-    """
+                   max_new_tokens: int = 1024,
+                   temperature: float = 0.6,
+                   top_p: float = 0.9,
+                   top_k: int = 50,
+                   repetition_penalty: float = 1.2):
     if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
         processor = processor_m
+        tokenizer = tokenizer_m
         model = model_m
     elif model_name == "SpaceThinker-3B":
         processor = processor_z
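The added tokenizer = tokenizer_m line implies a tokenizer is loaded next to the Nemotron processor earlier in app.py. A hedged sketch of that setup, assuming the usual Auto* loading pattern for this checkpoint (the checkpoint id, classes, and dtype are assumptions; only the _m names come from the diff):

import torch
from transformers import AutoModel, AutoProcessor, AutoTokenizer

MODEL_ID_M = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"  # assumed checkpoint id
processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
tokenizer_m = AutoTokenizer.from_pretrained(MODEL_ID_M)
model_m = AutoModel.from_pretrained(
    MODEL_ID_M,
    torch_dtype=torch.bfloat16,  # assumed dtype
    trust_remote_code=True,      # the Nemotron VL checkpoints ship custom code
).to(device).eval()              # `device` is defined earlier in app.py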
@@ -109,23 +103,43 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield "Please upload an image."
         return
 
-    messages = [{
-        "role": "user",
-        "content": [
-            {"type": "image", "image": image},
-            {"type": "text", "text": text},
-        ]
-    }]
-    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(
-        text=[prompt_full],
-        images=[image],
-        return_tensors="pt",
-        padding=True,
-        truncation=False,
-        max_length=MAX_INPUT_TOKEN_LENGTH
-    ).to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    # For Llama-3.1-Nemotron-Nano-VL-8B-V1, manually construct prompt and tokenize
+    if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
+        # Construct a simple prompt since apply_chat_template is not available
+        prompt_full = f"<|image|>{text}<|endoftext|>"
+        inputs = tokenizer(
+            prompt_full,
+            return_tensors="pt",
+            padding=True,
+            truncation=False,
+            max_length=MAX_INPUT_TOKEN_LENGTH
+        ).to(device)
+        # Process image separately
+        image_inputs = processor(image, return_tensors="pt").to(device)
+        inputs.update(image_inputs)
+    else:
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": text},
+            ]
+        }]
+        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(
+            text=[prompt_full],
+            images=[image],
+            return_tensors="pt",
+            padding=True,
+            truncation=False,
+            max_length=MAX_INPUT_TOKEN_LENGTH
+        ).to(device)
+
+    streamer = TextIteratorStreamer(
+        tokenizer if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1" else processor,
+        skip_prompt=True,
+        skip_special_tokens=True
+    )
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
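The unchanged tail of this hunk shows the streaming pattern both branches feed into: generate() blocks, so it runs on a worker thread while TextIteratorStreamer yields decoded text to the caller, and the streamer decodes with whatever object it is given, which is why the commit passes the tokenizer rather than the processor for the Nemotron branch. A self-contained sketch of the pattern with a placeholder text-only model:

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")           # placeholder model for illustration
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("The quick brown fox", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
# generate() runs on a worker thread and pushes tokens into the streamer
Thread(target=model.generate,
       kwargs={**inputs, "streamer": streamer, "max_new_tokens": 32}).start()

buffer = ""
for new_text in streamer:  # the main thread consumes text as it is produced
    buffer += new_text
print(buffer)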
@@ -138,16 +152,14 @@ def generate_image(model_name: str, text: str, image: Image.Image,
 
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
-                   max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
-                   top_k: int = 50,
-                   repetition_penalty: float = 1.2):
-    """
-    Generates responses using the selected model for video input.
-    """
+                   max_new_tokens: int = 1024,
+                   temperature: float = 0.6,
+                   top_p: float = 0.9,
+                   top_k: int = 50,
+                   repetition_penalty: float = 1.2):
     if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
         processor = processor_m
+        tokenizer = tokenizer_m
         model = model_m
     elif model_name == "SpaceThinker-3B":
         processor = processor_z
@@ -164,24 +176,47 @@ def generate_video(model_name: str, text: str, video_path: str,
         return
 
     frames = downsample_video(video_path)
-    messages = [
-        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-        {"role": "user", "content": [{"type": "text", "text": text}]}
-    ]
-    for frame in frames:
-        image, timestamp = frame
-        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
-        messages[1]["content"].append({"type": "image", "image": image})
-    inputs = processor.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=True,
-        return_dict=True,
-        return_tensors="pt",
-        truncation=False,
-        max_length=MAX_INPUT_TOKEN_LENGTH
-    ).to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
+        # Construct a simple prompt for Llama-3.1-Nemotron-Nano-VL-8B-V1
+        prompt_parts = ["<|startoftext|>You are a helpful assistant.<|endoftext|>", text]
+        for frame in frames:
+            image, timestamp = frame
+            prompt_parts.append(f"Frame {timestamp}: <|image|>")
+        prompt_full = " ".join(prompt_parts) + "<|endoftext|>"
+        inputs = tokenizer(
+            prompt_full,
+            return_tensors="pt",
+            padding=True,
+            truncation=False,
+            max_length=MAX_INPUT_TOKEN_LENGTH
+        ).to(device)
+        # Process all frames
+        image_inputs = processor([frame[0] for frame in frames], return_tensors="pt").to(device)
+        inputs.update(image_inputs)
+    else:
+        messages = [
+            {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+            {"role": "user", "content": [{"type": "text", "text": text}]}
+        ]
+        for frame in frames:
+            image, timestamp = frame
+            messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+            messages[1]["content"].append({"type": "image", "image": image})
+        inputs = processor.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_dict=True,
+            return_tensors="pt",
+            truncation=False,
+            max_length=MAX_INPUT_TOKEN_LENGTH
+        ).to(device)
+
+    streamer = TextIteratorStreamer(
+        tokenizer if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1" else processor,
+        skip_prompt=True,
+        skip_special_tokens=True
+    )
     generation_kwargs = {
         **inputs,
         "streamer": streamer,
 