PHI4-Multimodal

Runtime error

App Files Community

prithivMLmods committed on Feb 28

Commit

fcf45c6

verified ·

1 Parent(s): e680658

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -23

app.py CHANGED Viewed

@@ -37,7 +37,6 @@ from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
 os.system('pip install backoff')
 # Global constants and helper functions
 MAX_SEED = np.iinfo(np.int32).max
@@ -259,16 +258,7 @@ phi4_model = AutoModelForCausalLM.from_pretrained(
 # ------------------------------------------------------------------------------
 DESCRIPTION = """
-# Agent Dino 🌠
-This chatbot supports various commands:
-- **@tts1 / @tts2:** text-to-speech
-- **@image:** image generation
-- **@3d:** 3D mesh generation
-- **@web:** web search/visit
-- **@rAgent:** reasoning chain
-- **@yolo:** object detection
-- **@phi4:** multimodal (image/audio) question answering
-"""
 css = '''
 h1 {
@@ -582,14 +572,15 @@ def generate(
         if not question:
             yield "Error: Please provide a question after @phi4."
             return
         # Determine input type (Image or Audio) from the first file
         input_file = files[0]
         try:
             if isinstance(input_file, Image.Image):
                 input_type = "Image"
                 file_for_phi4 = input_file
             else:
                 try:
                     file_for_phi4 = Image.open(input_file)
                     input_type = "Image"
@@ -599,7 +590,7 @@ def generate(
         except Exception:
             input_type = "Audio"
             file_for_phi4 = input_file
         if input_type == "Image":
             phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
             inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
@@ -610,22 +601,20 @@ def generate(
         else:
             yield "Invalid file type for @phi4 multimodal processing."
             return
         with torch.no_grad():
             generate_ids = phi4_model.generate(
                 **inputs,
                 max_new_tokens=200,
                 num_logits_to_keep=0,
-                streamer=streamer  # Adding text streamer
             )
-        buffer = "⚛️ phi4 multimodal is initiated, hold tight"
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
     # --- Text and TTS branch ---
     tts_prefix = "@tts"

 from diffusers.utils import export_to_ply
 os.system('pip install backoff')
 # Global constants and helper functions
 MAX_SEED = np.iinfo(np.int32).max
 # ------------------------------------------------------------------------------
 DESCRIPTION = """
+# Agent Dino 🌠"""
 css = '''
 h1 {
         if not question:
             yield "Error: Please provide a question after @phi4."
             return
         # Determine input type (Image or Audio) from the first file
         input_file = files[0]
         try:
+            # If file is already a PIL Image, treat as image
             if isinstance(input_file, Image.Image):
                 input_type = "Image"
                 file_for_phi4 = input_file
             else:
+                # Try opening as image; if it fails, assume audio
                 try:
                     file_for_phi4 = Image.open(input_file)
                     input_type = "Image"
         except Exception:
             input_type = "Audio"
             file_for_phi4 = input_file
         if input_type == "Image":
             phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
             inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
         else:
             yield "Invalid file type for @phi4 multimodal processing."
             return
         with torch.no_grad():
             generate_ids = phi4_model.generate(
                 **inputs,
                 max_new_tokens=200,
                 num_logits_to_keep=0,
             )
+        input_length = inputs['input_ids'].shape[1]
+        generate_ids = generate_ids[:, input_length:]
+        response = phi4_processor.batch_decode(
+            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        yield response
+        return
     # --- Text and TTS branch ---
     tts_prefix = "@tts"