Update app.py
app.py CHANGED
@@ -35,6 +35,9 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
 
+# Additional import for Phi-4 multimodality (audio support)
+import soundfile as sf
+
 # Global constants and helper functions
 
 MAX_SEED = np.iinfo(np.int32).max
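A note on the new dependency: soundfile decodes an audio file into a NumPy array plus its sample rate, which is exactly the pair the Phi-4 processor consumes further down in this diff. A minimal sketch of that behavior (the .wav path is the example asset referenced later in the diff; any readable audio file works):

    import soundfile as sf

    # sf.read returns (samples, samplerate); mono files yield a 1-D array,
    # stereo files yield shape (n_frames, 2)
    audio, samplerate = sf.read("examples/harvard.wav")
    print(audio.shape, samplerate)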
@@ -424,7 +427,60 @@ def detect_objects(image: np.ndarray):
 
     return Image.fromarray(annotated_image)
 
-#
+# ---------------------------
+# Phi-4 Multimodal Model Setup with Text Streaming
+# ---------------------------
+phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
+
+phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
+phi4_model = AutoModelForCausalLM.from_pretrained(
+    phi4_model_path,
+    device_map="auto",
+    torch_dtype="auto",
+    trust_remote_code=True,
+    _attn_implementation="eager",
+)
+
+def process_phi4(input_type: str, file, question: str, max_new_tokens: int = 200):
+    """
+    Process an image or audio input with the Phi-4 multimodal model.
+    Uses a text streamer to yield incremental outputs.
+    Expects input_type to be either 'image' or 'audio'.
+    """
+    user_prompt = '<|user|>'
+    assistant_prompt = '<|assistant|>'
+    prompt_suffix = '<|end|>'
+
+    if not file or not question:
+        yield "Please upload a file and provide a question."
+        return
+
+    if input_type.lower() == "image":
+        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
+        image = Image.open(file)
+        inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
+    elif input_type.lower() == "audio":
+        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
+        audio, samplerate = sf.read(file)
+        inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
+    else:
+        yield "Invalid input type selected."
+        return
+
+    # Setup text streamer using TextIteratorStreamer for incremental generation
+    streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+    thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    yield "🤔 Thinking..."
+    for new_text in streamer:
+        buffer += new_text
+        buffer = buffer.replace("<|im_end|>", "")
+        time.sleep(0.01)
+        yield buffer
+
+# Chat Generation Function with support for @tts, @image, @3d, @web, @ragent, @yolo, and now @phi4 commands
 
 @spaces.GPU
 def generate(
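Because process_phi4 is a generator, it can also be exercised outside Gradio. A quick sketch, assuming the model and processor above have finished loading and the example image from this repo exists:

    # Each iteration yields the progressively growing text buffer,
    # starting with the "🤔 Thinking..." placeholder.
    for partial_text in process_phi4("image", "examples/math.webp", "Solve the problem"):
        print(partial_text)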
@@ -442,8 +498,9 @@ def generate(
     - "@image": triggers image generation using the SDXL pipeline.
     - "@3d": triggers 3D model generation using the ShapE pipeline.
     - "@web": triggers a web search or webpage visit.
-    - "@
+    - "@ragent": initiates a reasoning chain using Llama mode.
     - "@yolo": triggers object detection using YOLO.
+    - **New:** "@phi4": processes image or audio inputs with the Phi-4 multimodal model and streams text output.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
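For reference, input_dict is the payload the multimodal textbox hands to generate: the raw command string plus any attached file paths. A sketch of the shape the new @phi4 branch expects (values illustrative):

    input_dict = {
        "text": "@phi4 image What is shown in this picture?",  # trigger, input type, question
        "files": ["examples/math.webp"],                       # attachments from the textbox
    }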
@@ -539,6 +596,24 @@ def generate(
         yield gr.Image(result_img)
         return
 
+    # --- Phi-4 Multimodal branch with text streaming ---
+    if text.strip().lower().startswith("@phi4"):
+        # Expected format: "@phi4 [image|audio] <your question>"
+        parts = text.strip().split(maxsplit=2)
+        if len(parts) < 3:
+            yield "Error: Please provide input type and a question. Format: '@phi4 [image|audio] <your question>'"
+            return
+        input_type = parts[1]
+        question = parts[2]
+        if not files or len(files) == 0:
+            yield "Error: Please attach an image or audio file for Phi-4 processing."
+            return
+        file_input = files[0]
+        yield "🔄 Processing multimodal input with Phi-4..."
+        for partial in process_phi4(input_type, file_input, question):
+            yield partial
+        return
+
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
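The parsing above relies on str.split(maxsplit=2), which splits off the trigger and the input type while leaving the free-form question intact. A worked example:

    text = "@phi4 audio Transcribe the audio to text."
    parts = text.strip().split(maxsplit=2)
    # parts == ['@phi4', 'audio', 'Transcribe the audio to text.']
    input_type, question = parts[1], parts[2]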
@@ -627,12 +702,14 @@ demo = gr.ChatInterface(
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
+        [{"text": "@phi4 Solve the problem", "files": ["examples/math.webp"]}],
+        [{"text": "@phi4 Transcribe the audio to text.", "files": ["examples/harvard.wav"]}],
         ["@tts2 What causes rainbows to form?"],
         ["@image Chocolate dripping from a donut"],
         ["@3d A birthday cupcake with cherry"],
         [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
         [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
-        ["@
+        ["@ragent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
     ],
@@ -641,7 +718,7 @@ demo = gr.ChatInterface(
     description=DESCRIPTION,
     css=css,
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "audio"], file_count="multiple", placeholder="@tts1, @tts2, @image, @3d, @ragent, @web, @yolo, @phi4, or plain text"),
     stop_btn="Stop Generation",
     multimodal=True,
 )
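The textbox change is what actually lets audio attachments reach the @phi4 branch. A minimal, self-contained sketch of the same wiring (assumes Gradio 4.x, where a multimodal ChatInterface passes the handler a dict of text plus files; the echo handler here is a stand-in, not this app's generate):

    import gradio as gr

    def echo(message, history):
        # With multimodal=True, message is {"text": str, "files": [str, ...]}
        return f"text={message['text']!r}, files={len(message.get('files', []))}"

    demo = gr.ChatInterface(
        fn=echo,
        multimodal=True,
        textbox=gr.MultimodalTextbox(file_types=["image", "audio"], file_count="multiple"),
    )

    if __name__ == "__main__":
        demo.launch()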