prithivMLmods committed
Commit b40230a · verified · 1 Parent(s): 45691d2

Update app.py

Files changed (1): app.py +155 -102
app.py CHANGED
@@ -37,6 +37,7 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 MODEL_ID_M = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
 processor_m = AutoImageProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 tokenizer_m = AutoTokenizer.from_pretrained(MODEL_ID_M)
+tokenizer_m.pad_token = tokenizer_m.eos_token  # Set pad_token to resolve ValueError
 model_m = AutoModel.from_pretrained(
     MODEL_ID_M,
     trust_remote_code=True,
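
The added `pad_token` assignment works around tokenizers that ship without a padding token: any call that pads a batch then raises a ValueError. A minimal sketch of the failure and the workaround, reusing the model ID from this diff:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1")
# With no pad token set, padded batches fail with:
#   ValueError: Asking to pad but the tokenizer does not have a padding token.
tok.pad_token = tok.eos_token  # reuse EOS for padding, as the commit does
batch = tok(["short prompt", "a somewhat longer prompt"],
            padding=True, return_tensors="pt")
print(batch["input_ids"].shape)  # both rows padded to the same length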
@@ -89,35 +90,65 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         processor = processor_m
         tokenizer = tokenizer_m
         model = model_m
-    elif model_name == "SpaceThinker-3B":
-        processor = processor_z
-        model = model_z
-    elif model_name == "coreOCR-7B-050325-preview":
-        processor = processor_k
-        model = model_k
-    else:
-        yield "Invalid model selected."
-        return
-
-    if image is None:
-        yield "Please upload an image."
-        return
-
-    # For Llama-3.1-Nemotron-Nano-VL-8B-V1, manually construct prompt and tokenize
-    if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
-        # Construct a simple prompt since apply_chat_template is not available
-        prompt_full = f"<|image|>{text}<|endoftext|>"
-        inputs = tokenizer(
-            prompt_full,
-            return_tensors="pt",
-            padding=True,
-            truncation=False,
-            max_length=MAX_INPUT_TOKEN_LENGTH
-        ).to(device)
-        # Process image separately
-        image_inputs = processor(image, return_tensors="pt").to(device)
-        inputs.update(image_inputs)
-    else:
+        if image is None:
+            yield "Please upload an image."
+            return
+        # Construct message with <image> token as per reference
+        if "<image>" not in text:
+            message = f"<image>\n{text}"
+        else:
+            message = text
+
+        # Tokenize the message
+        inputs = tokenizer(message, return_tensors="pt").to(device)
+
+        # Process image
+        image_features = processor(image, return_tensors="pt").to(device)
+
+        # Combine inputs
+        generation_inputs = {
+            "input_ids": inputs["input_ids"],
+            "attention_mask": inputs["attention_mask"],
+            **image_features,
+        }
+
+        # Create streamer
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+        # Generation kwargs
+        generation_kwargs = {
+            **generation_inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+
+        # Start generation in a thread
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
+    elif model_name in ["SpaceThinker-3B", "coreOCR-7B-050325-preview"]:
+        if model_name == "SpaceThinker-3B":
+            processor = processor_z
+            model = model_z
+        else:
+            processor = processor_k
+            model = model_k
+
+        if image is None:
+            yield "Please upload an image."
+            return
+
         messages = [{
             "role": "user",
             "content": [
@@ -134,21 +165,19 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             truncation=False,
             max_length=MAX_INPUT_TOKEN_LENGTH
         ).to(device)
-
-    streamer = TextIteratorStreamer(
-        tokenizer if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1" else processor,
-        skip_prompt=True,
-        skip_special_tokens=True
-    )
-    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        buffer = buffer.replace("<|im_end|>", "")
-        time.sleep(0.01)
-        yield buffer
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
+    else:
+        yield "Invalid model selected."
+        return
 
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
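
Every branch now ends with the same threaded streaming idiom: `model.generate` runs on a worker thread while the generator drains a `TextIteratorStreamer`, yielding a growing buffer so Gradio can live-update the output. A standalone sketch of the idiom with a small text-only model (the model choice and prompt are illustrative):

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")               # illustrative model
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok("The quick brown fox", return_tensors="pt")

streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
thread = Thread(target=model.generate,
                kwargs={**inputs, "streamer": streamer, "max_new_tokens": 32})
thread.start()                    # generate() blocks, so it runs on a worker thread
for piece in streamer:            # yields decoded text chunks as they are produced
    print(piece, end="", flush=True)
thread.join()

The handlers additionally strip `<|im_end|>` from the buffer by hand, in case the end-of-turn tag is not registered as a special token for the active tokenizer.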
@@ -161,39 +190,65 @@ def generate_video(model_name: str, text: str, video_path: str,
         processor = processor_m
         tokenizer = tokenizer_m
         model = model_m
-    elif model_name == "SpaceThinker-3B":
-        processor = processor_z
-        model = model_z
-    elif model_name == "coreOCR-7B-050325-preview":
-        processor = processor_k
-        model = model_k
-    else:
-        yield "Invalid model selected."
-        return
-
-    if video_path is None:
-        yield "Please upload a video."
-        return
-
-    frames = downsample_video(video_path)
-    if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
-        # Construct a simple prompt for Llama-3.1-Nemotron-Nano-VL-8B-V1
-        prompt_parts = ["<|startoftext|>You are a helpful assistant.<|endoftext|>", text]
-        for frame in frames:
-            image, timestamp = frame
-            prompt_parts.append(f"Frame {timestamp}: <|image|>")
-        prompt_full = " ".join(prompt_parts) + "<|endoftext|>"
-        inputs = tokenizer(
-            prompt_full,
-            return_tensors="pt",
-            padding=True,
-            truncation=False,
-            max_length=MAX_INPUT_TOKEN_LENGTH
-        ).to(device)
+        if video_path is None:
+            yield "Please upload a video."
+            return
+        frames = downsample_video(video_path)
+        # Construct message with multiple <image> tokens
+        prompt_parts = ["<image>"] * len(frames) + [text]
+        message = " ".join(prompt_parts)
+
+        # Tokenize
+        inputs = tokenizer(message, return_tensors="pt").to(device)
+
         # Process all frames
-        image_inputs = processor([frame[0] for frame in frames], return_tensors="pt").to(device)
-        inputs.update(image_inputs)
-    else:
+        image_features = processor([frame[0] for frame in frames], return_tensors="pt").to(device)
+
+        # Combine inputs
+        generation_inputs = {
+            "input_ids": inputs["input_ids"],
+            "attention_mask": inputs["attention_mask"],
+            **image_features,
+        }
+
+        # Create streamer
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+        # Generation kwargs
+        generation_kwargs = {
+            **generation_inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+
+        # Start generation in a thread
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
+    elif model_name in ["SpaceThinker-3B", "coreOCR-7B-050325-preview"]:
+        if model_name == "SpaceThinker-3B":
+            processor = processor_z
+            model = model_z
+        else:
+            processor = processor_k
+            model = model_k
+
+        if video_path is None:
+            yield "Please upload a video."
+            return
+
+        frames = downsample_video(video_path)
         messages = [
             {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
             {"role": "user", "content": [{"type": "text", "text": text}]}
@@ -208,33 +263,31 @@ def generate_video(model_name: str, text: str, video_path: str,
             add_generation_prompt=True,
             return_dict=True,
             return_tensors="pt",
             truncation=False,
             max_length=MAX_INPUT_TOKEN_LENGTH
         ).to(device)
-
-    streamer = TextIteratorStreamer(
-        tokenizer if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1" else processor,
-        skip_prompt=True,
-        skip_special_tokens=True
-    )
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "do_sample": True,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-    }
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        buffer = buffer.replace("<|im_end|>", "")
-        time.sleep(0.01)
-        yield buffer
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
+    else:
+        yield "Invalid model selected."
+        return
 
 # Define examples for image and video inference
 image_examples = [
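
The video path threads the sampling controls (temperature, top_p, top_k, repetition penalty) straight into `model.generate` as keyword arguments. The same configuration can be expressed once as a `GenerationConfig` and reused across calls; a sketch with illustrative values:

from transformers import GenerationConfig

gen_cfg = GenerationConfig(
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1,
    max_new_tokens=512,
)
# model.generate(**inputs, generation_config=gen_cfg, streamer=streamer)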
@@ -293,11 +346,11 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         model_choice = gr.Radio(
             choices=["Llama-3.1-Nemotron-Nano-VL-8B-V1", "SpaceThinker-3B", "coreOCR-7B-050325-preview"],
             label="Select Model",
-            value="SkyCaptioner-V1"
+            value="Llama-3.1-Nemotron-Nano-VL-8B-V1"  # Updated default value to a valid choice
         )
 
         gr.Markdown("**Model Info**")
-        gr.Markdown("⤷ [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1): structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
+        gr.Markdown("⤷ [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1): a structural video captioning model designed to generate high-quality, structured descriptions of video data. It integrates specialized sub-expert models.")
         gr.Markdown("⤷ [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): a thinking/reasoning multimodal vision-language model (VLM) trained to enhance spatial reasoning.")
         gr.Markdown("⤷ [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (OCR) and long-context vision-language understanding.")
         gr.Markdown("⤷ [Imgscope-OCR-2B-0527](https://huggingface.co/prithivMLmods/Imgscope-OCR-2B-0527): a fine-tuned version of qwen2-vl-2b-instruct, specifically optimized for messy handwriting recognition, document OCR, realistic handwritten OCR, and math problem solving with LaTeX formatting.")
 