Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -37,7 +37,6 @@ from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
|
|
| 37 |
from diffusers.utils import export_to_ply
|
| 38 |
|
| 39 |
os.system('pip install backoff')
|
| 40 |
-
|
| 41 |
# Global constants and helper functions
|
| 42 |
|
| 43 |
MAX_SEED = np.iinfo(np.int32).max
|
|
@@ -259,16 +258,7 @@ phi4_model = AutoModelForCausalLM.from_pretrained(
|
|
| 259 |
# ------------------------------------------------------------------------------
|
| 260 |
|
| 261 |
DESCRIPTION = """
|
| 262 |
-
# Agent Dino 🌠
|
| 263 |
-
This chatbot supports various commands:
|
| 264 |
-
- **@tts1 / @tts2:** text-to-speech
|
| 265 |
-
- **@image:** image generation
|
| 266 |
-
- **@3d:** 3D mesh generation
|
| 267 |
-
- **@web:** web search/visit
|
| 268 |
-
- **@rAgent:** reasoning chain
|
| 269 |
-
- **@yolo:** object detection
|
| 270 |
-
- **@phi4:** multimodal (image/audio) question answering
|
| 271 |
-
"""
|
| 272 |
|
| 273 |
css = '''
|
| 274 |
h1 {
|
|
@@ -582,14 +572,15 @@ def generate(
|
|
| 582 |
if not question:
|
| 583 |
yield "Error: Please provide a question after @phi4."
|
| 584 |
return
|
| 585 |
-
|
| 586 |
# Determine input type (Image or Audio) from the first file
|
| 587 |
input_file = files[0]
|
| 588 |
try:
|
|
|
|
| 589 |
if isinstance(input_file, Image.Image):
|
| 590 |
input_type = "Image"
|
| 591 |
file_for_phi4 = input_file
|
| 592 |
else:
|
|
|
|
| 593 |
try:
|
| 594 |
file_for_phi4 = Image.open(input_file)
|
| 595 |
input_type = "Image"
|
|
@@ -599,7 +590,7 @@ def generate(
|
|
| 599 |
except Exception:
|
| 600 |
input_type = "Audio"
|
| 601 |
file_for_phi4 = input_file
|
| 602 |
-
|
| 603 |
if input_type == "Image":
|
| 604 |
phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
|
| 605 |
inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
|
|
@@ -610,22 +601,20 @@ def generate(
|
|
| 610 |
else:
|
| 611 |
yield "Invalid file type for @phi4 multimodal processing."
|
| 612 |
return
|
| 613 |
-
|
| 614 |
with torch.no_grad():
|
| 615 |
generate_ids = phi4_model.generate(
|
| 616 |
**inputs,
|
| 617 |
max_new_tokens=200,
|
| 618 |
num_logits_to_keep=0,
|
| 619 |
-
streamer=streamer # Adding text streamer
|
| 620 |
)
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
|
| 630 |
# --- Text and TTS branch ---
|
| 631 |
tts_prefix = "@tts"
|
|
|
|
| 37 |
from diffusers.utils import export_to_ply
|
| 38 |
|
| 39 |
os.system('pip install backoff')
|
|
|
|
| 40 |
# Global constants and helper functions
|
| 41 |
|
| 42 |
MAX_SEED = np.iinfo(np.int32).max
|
|
|
|
| 258 |
# ------------------------------------------------------------------------------
|
| 259 |
|
| 260 |
DESCRIPTION = """
|
| 261 |
+
# Agent Dino 🌠"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
|
| 263 |
css = '''
|
| 264 |
h1 {
|
|
|
|
| 572 |
if not question:
|
| 573 |
yield "Error: Please provide a question after @phi4."
|
| 574 |
return
|
|
|
|
| 575 |
# Determine input type (Image or Audio) from the first file
|
| 576 |
input_file = files[0]
|
| 577 |
try:
|
| 578 |
+
# If file is already a PIL Image, treat as image
|
| 579 |
if isinstance(input_file, Image.Image):
|
| 580 |
input_type = "Image"
|
| 581 |
file_for_phi4 = input_file
|
| 582 |
else:
|
| 583 |
+
# Try opening as image; if it fails, assume audio
|
| 584 |
try:
|
| 585 |
file_for_phi4 = Image.open(input_file)
|
| 586 |
input_type = "Image"
|
|
|
|
| 590 |
except Exception:
|
| 591 |
input_type = "Audio"
|
| 592 |
file_for_phi4 = input_file
|
| 593 |
+
|
| 594 |
if input_type == "Image":
|
| 595 |
phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
|
| 596 |
inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
|
|
|
|
| 601 |
else:
|
| 602 |
yield "Invalid file type for @phi4 multimodal processing."
|
| 603 |
return
|
| 604 |
+
|
| 605 |
with torch.no_grad():
|
| 606 |
generate_ids = phi4_model.generate(
|
| 607 |
**inputs,
|
| 608 |
max_new_tokens=200,
|
| 609 |
num_logits_to_keep=0,
|
|
|
|
| 610 |
)
|
| 611 |
+
input_length = inputs['input_ids'].shape[1]
|
| 612 |
+
generate_ids = generate_ids[:, input_length:]
|
| 613 |
+
response = phi4_processor.batch_decode(
|
| 614 |
+
generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
| 615 |
+
)[0]
|
| 616 |
+
yield response
|
| 617 |
+
return
|
|
|
|
| 618 |
|
| 619 |
# --- Text and TTS branch ---
|
| 620 |
tts_prefix = "@tts"
|