Spaces:

shukdevdattaEX
/

Gemma-3n-Multi-modal-chatbot

Sleeping

App Files Files Community

shukdevdattaEX committed on Jul 19

Commit

dcec17f

verified ·

1 Parent(s): dc31593

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -258

app.py CHANGED Viewed

@@ -4,17 +4,10 @@ import io
 import os
 from openai import OpenAI
 import PyPDF2
-from PIL import Image
 import speech_recognition as sr
 import tempfile
-import cv2
-import numpy as np
-from typing import List, Tuple, Optional
-import json
-import pydub
 from pydub import AudioSegment
-from transformers import pipeline
-import torch
 class MultimodalChatbot:
     def __init__(self, api_key: str):
@@ -22,54 +15,23 @@ class MultimodalChatbot:
             base_url="https://openrouter.ai/api/v1",
             api_key=api_key,
         )
-        self.model = "google/gemma-2-9b-it:free"
         self.conversation_history = []
-        # Initialize the pipeline for image-text-to-text processing
-        try:
-            self.pipe = pipeline(
-                "image-captioning",
-                model="Salesforce/blip-image-captioning-base",
-                device="cpu",  # Optimized for CPU in HF Spaces
-                torch_dtype=torch.float32,  # Use float32 for CPU compatibility
-            )
-            print("Image captioning pipeline initialized successfully")
-        except Exception as e:
-            print(f"Error initializing image captioning pipeline: {e}")
-            self.pipe = None
-    def encode_image_to_base64(self, image) -> str:
-        """Convert PIL Image or file path to base64 string"""
-        try:
-            if isinstance(image, str):
-                with open(image, "rb") as img_file:
-                    return base64.b64encode(img_file.read()).decode('utf-8')
-            elif isinstance(image, Image.Image):
-                buffered = io.BytesIO()
-                if image.mode == 'RGBA':
-                    image = image.convert('RGB')
-                image.save(buffered, format="JPEG", quality=85)
-                return base64.b64encode(buffered.getvalue()).decode('utf-8')
-            else:
-                raise ValueError("Invalid image input")
-        except Exception as e:
-            return f"Error encoding image: {str(e)}"
     def extract_pdf_text(self, pdf_file) -> str:
         """Extract text from PDF file"""
         try:
-            if isinstance(pdf_file, str):
-                pdf_path = pdf_file
-            elif hasattr(pdf_file, 'name'):
                 pdf_path = pdf_file.name
             else:
-                raise ValueError("Invalid PDF file input")
             text = ""
             with open(pdf_path, 'rb') as file:
                 pdf_reader = PyPDF2.PdfReader(file)
                 for page_num, page in enumerate(pdf_reader.pages):
                     page_text = page.extract_text()
-                    if page_text and page_text.strip():
                         text += f"Page {page_num + 1}:\n{page_text}\n\n"
             return text.strip() if text.strip() else "No text could be extracted from this PDF."
         except Exception as e:
@@ -78,12 +40,10 @@ class MultimodalChatbot:
     def convert_audio_to_wav(self, audio_file) -> str:
         """Convert audio file to WAV format for speech recognition"""
         try:
-            if isinstance(audio_file, str):
-                audio_path = audio_file
-            elif hasattr(audio_file, 'name'):
                 audio_path = audio_file.name
             else:
-                raise ValueError("Invalid audio file input")
             file_ext = os.path.splitext(audio_path)[1].lower()
             if file_ext == '.wav':
@@ -94,7 +54,7 @@ class MultimodalChatbot:
             audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
             return wav_path
         except Exception as e:
-            return f"Error converting audio: {str(e)}"
     def transcribe_audio(self, audio_file) -> str:
         """Transcribe audio file to text"""
@@ -105,6 +65,7 @@ class MultimodalChatbot:
             with sr.AudioFile(wav_path) as source:
                 recognizer.adjust_for_ambient_noise(source, duration=0.2)
                 audio_data = recognizer.record(source)
                 try:
                     text = recognizer.recognize_google(audio_data)
                     return text
@@ -119,47 +80,10 @@ class MultimodalChatbot:
         except Exception as e:
             return f"Error transcribing audio: {str(e)}"
-    def extract_video_frame(self, video_file, frame_number=None):
-        """Extract a frame from the video"""
-        try:
-            if isinstance(video_file, str):
-                video_path = video_file
-            elif hasattr(video_file, 'name'):
-                video_path = video_file.name
-            else:
-                raise ValueError("Invalid video file input")
-            cap = cv2.VideoCapture(video_path)
-            if not cap.isOpened():
-                return None, "Could not open video file"
-            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-            if total_frames <= 0:
-                cap.release()
-                return None, "Video has no frames"
-            if frame_number is None:
-                frame_number = total_frames // 2  # Extract middle frame
-            if frame_number >= total_frames:
-                frame_number = total_frames - 1
-            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
-            ret, frame = cap.read()
-            cap.release()
-            if ret:
-                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                return Image.fromarray(frame), f"Extracted frame {frame_number} of {total_frames}"
-            else:
-                return None, "Failed to extract frame"
-        except Exception as e:
-            return None, f"Error extracting video frame: {str(e)}"
     def create_multimodal_message(self,
                                 text_input: str = "",
                                 pdf_file=None,
-                                audio_file=None,
-                                image_file=None,
-                                video_file=None) -> dict:
         """Create a multimodal message for the API"""
         content_parts = []
         processing_info = []
@@ -169,64 +93,26 @@ class MultimodalChatbot:
         if pdf_file is not None:
             pdf_text = self.extract_pdf_text(pdf_file)
-            content_parts.append({"type": "text", "text": f"PDF Content:\n{pdf_text}"})
             processing_info.append("📄 PDF processed")
         if audio_file is not None:
             audio_text = self.transcribe_audio(audio_file)
-            content_parts.append({"type": "text", "text": f"Audio Transcription:\n{audio_text}"})
             processing_info.append("🎤 Audio transcribed")
-        if image_file is not None and self.pipe is not None:
-            try:
-                if isinstance(image_file, str):
-                    image = Image.open(image_file)
-                else:
-                    image = image_file
-                # Use BLIP model for image captioning
-                output = self.pipe(image)
-                description = output[0]['generated_caption']
-                if text_input:
-                    content_parts.append({"type": "text", "text": f"Image analysis (based on '{text_input}'): {description}"})
-                else:
-                    content_parts.append({"type": "text", "text": f"Image analysis: {description}"})
-                processing_info.append("🖼️ Image analyzed")
-            except Exception as e:
-                content_parts.append({"type": "text", "text": f"Error analyzing image: {str(e)}"})
-                processing_info.append("🖼️ Image analysis failed")
-        elif image_file is not None:
-            content_parts.append({"type": "text", "text": "Image uploaded. Analysis failed due to model initialization error."})
-            processing_info.append("🖼️ Image received (analysis failed)")
-        if video_file is not None and self.pipe is not None:
-            frame, frame_info = self.extract_video_frame(video_file)
-            if frame:
-                try:
-                    output = self.pipe(frame)
-                    description = output[0]['generated_caption']
-                    if text_input:
-                        content_parts.append({"type": "text", "text": f"Video frame analysis (based on '{text_input}'): {description}. Frame info: {frame_info}. Please describe the video for further assistance."})
-                    else:
-                        content_parts.append({"type": "text", "text": f"Video frame analysis: {description}. Frame info: {frame_info}. Please describe the video for further assistance."})
-                    processing_info.append("🎥 Video frame analyzed")
-                except Exception as e:
-                    content_parts.append({"type": "text", "text": f"Error analyzing video frame: {str(e)}. Frame info: {frame_info}"})
-                    processing_info.append("🎥 Video frame analysis failed")
-            else:
-                content_parts.append({"type": "text", "text": f"Could not extract frame from video: {frame_info}. Please describe the video."})
-                processing_info.append("🎥 Video processing failed")
-        elif video_file is not None:
-            content_parts.append({"type": "text", "text": "Video uploaded. Analysis failed due to model initialization error."})
-            processing_info.append("🎥 Video received (analysis failed)")
         return {"role": "user", "content": content_parts}, processing_info
     def chat(self,
              text_input: str = "",
              pdf_file=None,
              audio_file=None,
-             image_file=None,
-             video_file=None,
              history: List[Tuple[str, str]] = None) -> Tuple[List[Tuple[str, str]], str]:
         """Main chat function"""
         if history is None:
@@ -240,20 +126,18 @@ class MultimodalChatbot:
                 user_message_parts.append("📄 PDF uploaded")
             if audio_file:
                 user_message_parts.append("🎤 Audio uploaded")
-            if image_file:
-                user_message_parts.append("🖼️ Image uploaded")
-            if video_file:
-                user_message_parts.append("🎥 Video uploaded")
             user_display = " | ".join(user_message_parts)
             user_message, processing_info = self.create_multimodal_message(
-                text_input, pdf_file, audio_file, image_file, video_file
             )
             if processing_info:
                 user_display += f"\n{' | '.join(processing_info)}"
             messages = [user_message]
             completion = self.client.chat.completions.create(
                 extra_headers={
                     "HTTP-Referer": "https://multimodal-chatbot.local",
@@ -267,7 +151,9 @@ class MultimodalChatbot:
             bot_response = completion.choices[0].message.content
             history.append((user_display, bot_response))
             return history, ""
         except Exception as e:
             error_msg = f"Error: {str(e)}"
             history.append((user_display if 'user_display' in locals() else "Error in input", error_msg))
@@ -275,16 +161,14 @@ class MultimodalChatbot:
 def create_interface():
     """Create the Gradio interface"""
-    with gr.Blocks(title="Multimodal Chatbot with BLIP and Gemma", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
-        # 🤖 Multimodal Chatbot with BLIP and Gemma
         This chatbot can process multiple types of input:
-        - **Text**: Regular text messages using Gemma
         - **PDF**: Extract and analyze document content
         - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)
-        - **Images**: Upload images for analysis using BLIP
-        - **Video**: Upload videos for basic frame analysis using BLIP
         **Setup**: Enter your OpenRouter API key below to get started
         """)
@@ -314,6 +198,7 @@ def create_interface():
                         )
                         text_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                         text_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                     with gr.Column(scale=2):
                         text_chatbot = gr.Chatbot(
                             label="Text Chat History",
@@ -337,6 +222,7 @@ def create_interface():
                         )
                         pdf_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                         pdf_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                     with gr.Column(scale=2):
                         pdf_chatbot = gr.Chatbot(
                             label="PDF Chat History",
@@ -360,6 +246,7 @@ def create_interface():
                         )
                         audio_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                         audio_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                     with gr.Column(scale=2):
                         audio_chatbot = gr.Chatbot(
                             label="Audio Chat History",
@@ -368,51 +255,6 @@ def create_interface():
                             show_copy_button=True
                         )
-            with gr.TabItem("🖼️ Image Chat"):
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        image_input = gr.Image(
-                            label="🖼️ Image Upload",
-                            type="pil"
-                        )
-                        image_text_input = gr.Textbox(
-                            label="💬 Question about Image",
-                            placeholder="Ask something about the image...",
-                            lines=3
-                        )
-                        image_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
-                        image_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
-                    with gr.Column(scale=2):
-                        image_chatbot = gr.Chatbot(
-                            label="Image Chat History",
-                            height=600,
-                            bubble_full_width=False,
-                            show_copy_button=True
-                        )
-            with gr.TabItem("🎥 Video Chat"):
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        video_input = gr.File(
-                            label="🎥 Video Upload",
-                            file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
-                            type="filepath"
-                        )
-                        video_text_input = gr.Textbox(
-                            label="💬 Question about Video",
-                            placeholder="Ask something about the video...",
-                            lines=3
-                        )
-                        video_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
-                        video_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
-                    with gr.Column(scale=2):
-                        video_chatbot = gr.Chatbot(
-                            label="Video Chat History",
-                            height=600,
-                            bubble_full_width=False,
-                            show_copy_button=True
-                        )
             with gr.TabItem("🌟 Combined Chat"):
                 with gr.Row():
                     with gr.Column(scale=1):
@@ -431,17 +273,9 @@ def create_interface():
                             file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
                             type="filepath"
                         )
-                        combined_image_input = gr.Image(
-                            label="🖼️ Image Upload",
-                            type="pil"
-                        )
-                        combined_video_input = gr.File(
-                            label="🎥 Video Upload",
-                            file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
-                            type="filepath"
-                        )
                         combined_submit_btn = gr.Button("🚀 Send All", variant="primary", size="lg", interactive=False)
                         combined_clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
                     with gr.Column(scale=2):
                         combined_chatbot = gr.Chatbot(
                             label="Combined Chat History",
@@ -452,15 +286,16 @@ def create_interface():
         def validate_api_key(api_key):
             if not api_key or len(api_key.strip()) == 0:
-                return "❌ API Key not provided", *[gr.update(interactive=False) for _ in range(6)]
             try:
                 test_client = OpenAI(
                     base_url="https://openrouter.ai/api/v1",
                     api_key=api_key.strip(),
                 )
-                return "✅ API Key validated successfully", *[gr.update(interactive=True) for _ in range(6)]
             except Exception as e:
-                return f"❌ API Key validation failed: {str(e)}", *[gr.update(interactive=False) for _ in range(6)]
         def process_text_input(api_key, text, history):
             if not api_key or len(api_key.strip()) == 0:
@@ -468,6 +303,7 @@ def create_interface():
                     history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, history=history)
@@ -477,6 +313,7 @@ def create_interface():
                     history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, pdf_file=pdf, history=history)
@@ -486,47 +323,30 @@ def create_interface():
                     history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, audio_file=audio, history=history)
-        def process_image_input(api_key, image, text, history):
-            if not api_key or len(api_key.strip()) == 0:
-                if history is None:
-                    history = []
-                history.append(("Error", "❌ Please provide a valid API key first"))
-                return history, ""
-            chatbot = MultimodalChatbot(api_key.strip())
-            return chatbot.chat(text_input=text, image_file=image, history=history)
-        def process_video_input(api_key, video, text, history):
-            if not api_key or len(api_key.strip()) == 0:
-                if history is None:
-                    history = []
-                history.append(("Error", "❌ Please provide a valid API key first"))
-                return history, ""
-            chatbot = MultimodalChatbot(api_key.strip())
-            return chatbot.chat(text_input=text, video_file=video, history=history)
-        def process_combined_input(api_key, text, pdf, audio, image, video, history):
             if not api_key or len(api_key.strip()) == 0:
                 if history is None:
                     history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
             chatbot = MultimodalChatbot(api_key.strip())
-            return chatbot.chat(text_input=text, pdf_file=pdf, audio_file=audio, image_file=image, video_file=video, history=history)
         def clear_chat():
             return [], ""
         def clear_all_inputs():
-            return [], "", None, None, None, None
         api_key_input.change(
             validate_api_key,
             inputs=[api_key_input],
-            outputs=[api_status, text_submit_btn, pdf_submit_btn, audio_submit_btn,
-                    image_submit_btn, video_submit_btn, combined_submit_btn]
         )
         text_submit_btn.click(
@@ -555,34 +375,20 @@ def create_interface():
         )
         audio_clear_btn.click(lambda: ([], "", None), outputs=[audio_chatbot, audio_text_input, audio_input])
-        image_submit_btn.click(
-            process_image_input,
-            inputs=[api_key_input, image_input, image_text_input, image_chatbot],
-            outputs=[image_chatbot, image_text_input]
-        )
-        image_clear_btn.click(lambda: ([], "", None), outputs=[image_chatbot, image_text_input, image_input])
-        video_submit_btn.click(
-            process_video_input,
-            inputs=[api_key_input, video_input, video_text_input, video_chatbot],
-            outputs=[video_chatbot, video_text_input]
-        )
-        video_clear_btn.click(lambda: ([], "", None), outputs=[video_chatbot, video_text_input, video_input])
         combined_submit_btn.click(
             process_combined_input,
             inputs=[api_key_input, combined_text_input, combined_pdf_input,
-                   combined_audio_input, combined_image_input, combined_video_input, combined_chatbot],
             outputs=[combined_chatbot, combined_text_input]
         )
         combined_clear_btn.click(clear_all_inputs,
-                               outputs=[combined_chatbot, combined_text_input, combined_pdf_input,
-                                      combined_audio_input, combined_image_input, combined_video_input])
         gr.Markdown("""
         ### 🎯 How to Use Each Tab:
-        **💬 Text Chat**: Simple text conversations with the AI using Gemma
         **📄 PDF Chat**: Upload a PDF and ask questions about its content
@@ -590,12 +396,6 @@ def create_interface():
         - Supports: WAV, MP3, M4A, FLAC, OGG formats
         - Best results with clear speech and minimal background noise
-        **🖼️ Image Chat**: Upload images for analysis using BLIP
-        - Provide a text prompt to guide the analysis (e.g., "What is in this image?")
-        **🎥 Video Chat**: Upload videos for basic frame analysis using BLIP
-        - Analysis is based on a single frame; provide a text description for full video context
         **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis
         ### 🔑 Getting an API Key:
@@ -606,10 +406,8 @@ def create_interface():
         5. Copy and paste it in the field above
         ### ⚠️ Current Limitations:
-        - Image and video analysis may be slow on CPU in Hugging Face Spaces
-        - Video analysis is limited to a single frame due to CPU constraints
         - Large files may take longer to process
-        - BLIP model may provide basic captions; detailed video descriptions require additional user input
         """)
     return demo
@@ -619,16 +417,11 @@ if __name__ == "__main__":
         "gradio",
         "openai",
         "PyPDF2",
-        "Pillow",
         "SpeechRecognition",
-        "opencv-python",
-        "numpy",
-        "pydub",
-        "transformers",
-        "torch"
     ]
-    print("🚀 Multimodal Chatbot with BLIP and Gemma")
     print("=" * 50)
     print("Required packages:", ", ".join(required_packages))
     print("\n📦 To install: pip install " + " ".join(required_packages))
@@ -639,4 +432,6 @@ if __name__ == "__main__":
     print("💡 Enter your API key in the web interface when it loads")
     demo = create_interface()
-    demo.launch(share=True) #

 import os
 from openai import OpenAI
 import PyPDF2
 import speech_recognition as sr
 import tempfile
 from pydub import AudioSegment
+from typing import List, Tuple, Optional
 class MultimodalChatbot:
     def __init__(self, api_key: str):
             base_url="https://openrouter.ai/api/v1",
             api_key=api_key,
         )
+        self.model = "google/gemma-3n-e2b-it:free"
         self.conversation_history = []
     def extract_pdf_text(self, pdf_file) -> str:
         """Extract text from PDF file"""
         try:
+            if hasattr(pdf_file, 'name'):
                 pdf_path = pdf_file.name
             else:
+                pdf_path = pdf_file
             text = ""
             with open(pdf_path, 'rb') as file:
                 pdf_reader = PyPDF2.PdfReader(file)
                 for page_num, page in enumerate(pdf_reader.pages):
                     page_text = page.extract_text()
+                    if page_text.strip():
                         text += f"Page {page_num + 1}:\n{page_text}\n\n"
             return text.strip() if text.strip() else "No text could be extracted from this PDF."
         except Exception as e:
     def convert_audio_to_wav(self, audio_file) -> str:
         """Convert audio file to WAV format for speech recognition"""
         try:
+            if hasattr(audio_file, 'name'):
                 audio_path = audio_file.name
             else:
+                audio_path = audio_file
             file_ext = os.path.splitext(audio_path)[1].lower()
             if file_ext == '.wav':
             audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
             return wav_path
         except Exception as e:
+            raise Exception(f"Error converting audio: {str(e)}")
     def transcribe_audio(self, audio_file) -> str:
         """Transcribe audio file to text"""
             with sr.AudioFile(wav_path) as source:
                 recognizer.adjust_for_ambient_noise(source, duration=0.2)
                 audio_data = recognizer.record(source)
                 try:
                     text = recognizer.recognize_google(audio_data)
                     return text
         except Exception as e:
             return f"Error transcribing audio: {str(e)}"
     def create_multimodal_message(self,
                                 text_input: str = "",
                                 pdf_file=None,
+                                audio_file=None) -> dict:
         """Create a multimodal message for the API"""
         content_parts = []
         processing_info = []
         if pdf_file is not None:
             pdf_text = self.extract_pdf_text(pdf_file)
+            content_parts.append({
+                "type": "text",
+                "text": f"PDF Content:\n{pdf_text}"
+            })
             processing_info.append("📄 PDF processed")
         if audio_file is not None:
             audio_text = self.transcribe_audio(audio_file)
+            content_parts.append({
+                "type": "text",
+                "text": f"Audio Transcription:\n{audio_text}"
+            })
             processing_info.append("🎤 Audio transcribed")
         return {"role": "user", "content": content_parts}, processing_info
     def chat(self,
              text_input: str = "",
              pdf_file=None,
              audio_file=None,
              history: List[Tuple[str, str]] = None) -> Tuple[List[Tuple[str, str]], str]:
         """Main chat function"""
         if history is None:
                 user_message_parts.append("📄 PDF uploaded")
             if audio_file:
                 user_message_parts.append("🎤 Audio uploaded")
             user_display = " | ".join(user_message_parts)
             user_message, processing_info = self.create_multimodal_message(
+                text_input, pdf_file, audio_file
             )
             if processing_info:
                 user_display += f"\n{' | '.join(processing_info)}"
             messages = [user_message]
             completion = self.client.chat.completions.create(
                 extra_headers={
                     "HTTP-Referer": "https://multimodal-chatbot.local",
             bot_response = completion.choices[0].message.content
             history.append((user_display, bot_response))
             return history, ""
         except Exception as e:
             error_msg = f"Error: {str(e)}"
             history.append((user_display if 'user_display' in locals() else "Error in input", error_msg))
 def create_interface():
     """Create the Gradio interface"""
+    with gr.Blocks(title="Multimodal Chatbot with Gemma 3n", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
+        # 🤖 Multimodal Chatbot with Gemma 3n
         This chatbot can process multiple types of input:
+        - **Text**: Regular text messages
         - **PDF**: Extract and analyze document content
         - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)
         **Setup**: Enter your OpenRouter API key below to get started
         """)
                         )
                         text_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                         text_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                     with gr.Column(scale=2):
                         text_chatbot = gr.Chatbot(
                             label="Text Chat History",
                         )
                         pdf_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                         pdf_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                     with gr.Column(scale=2):
                         pdf_chatbot = gr.Chatbot(
                             label="PDF Chat History",
                         )
                         audio_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                         audio_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                     with gr.Column(scale=2):
                         audio_chatbot = gr.Chatbot(
                             label="Audio Chat History",
                             show_copy_button=True
                         )
             with gr.TabItem("🌟 Combined Chat"):
                 with gr.Row():
                     with gr.Column(scale=1):
                             file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
                             type="filepath"
                         )
                         combined_submit_btn = gr.Button("🚀 Send All", variant="primary", size="lg", interactive=False)
                         combined_clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
                     with gr.Column(scale=2):
                         combined_chatbot = gr.Chatbot(
                             label="Combined Chat History",
         def validate_api_key(api_key):
             if not api_key or len(api_key.strip()) == 0:
+                return "❌ API Key not provided", *[gr.update(interactive=False) for _ in range(4)]
             try:
                 test_client = OpenAI(
                     base_url="https://openrouter.ai/api/v1",
                     api_key=api_key.strip(),
                 )
+                return "✅ API Key validated successfully", *[gr.update(interactive=True) for _ in range(4)]
             except Exception as e:
+                return f"❌ API Key validation failed: {str(e)}", *[gr.update(interactive=False) for _ in range(4)]
         def process_text_input(api_key, text, history):
             if not api_key or len(api_key.strip()) == 0:
                     history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, history=history)
                     history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, pdf_file=pdf, history=history)
                     history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, audio_file=audio, history=history)
+        def process_combined_input(api_key, text, pdf, audio, history):
             if not api_key or len(api_key.strip()) == 0:
                 if history is None:
                     history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
             chatbot = MultimodalChatbot(api_key.strip())
+            return chatbot.chat(text, pdf, audio, history)
         def clear_chat():
             return [], ""
         def clear_all_inputs():
+            return [], "", None, None
         api_key_input.change(
             validate_api_key,
             inputs=[api_key_input],
+            outputs=[api_status, text_submit_btn, pdf_submit_btn, audio_submit_btn, combined_submit_btn]
         )
         text_submit_btn.click(
         )
         audio_clear_btn.click(lambda: ([], "", None), outputs=[audio_chatbot, audio_text_input, audio_input])
         combined_submit_btn.click(
             process_combined_input,
             inputs=[api_key_input, combined_text_input, combined_pdf_input,
+                   combined_audio_input, combined_chatbot],
             outputs=[combined_chatbot, combined_text_input]
         )
         combined_clear_btn.click(clear_all_inputs,
+                               outputs=[combined_chatbot, combined_text_input,
+                                      combined_pdf_input, combined_audio_input])
         gr.Markdown("""
         ### 🎯 How to Use Each Tab:
+        **💬 Text Chat**: Simple text conversations with the AI
         **📄 PDF Chat**: Upload a PDF and ask questions about its content
         - Supports: WAV, MP3, M4A, FLAC, OGG formats
         - Best results with clear speech and minimal background noise
         **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis
         ### 🔑 Getting an API Key:
         5. Copy and paste it in the field above
         ### ⚠️ Current Limitations:
+        - Audio transcription requires internet connection for best results
         - Large files may take longer to process
         """)
     return demo
         "gradio",
         "openai",
         "PyPDF2",
         "SpeechRecognition",
+        "pydub"
     ]
+    print("🚀 Multimodal Chatbot with Gemma 3n")
     print("=" * 50)
     print("Required packages:", ", ".join(required_packages))
     print("\n📦 To install: pip install " + " ".join(required_packages))
     print("💡 Enter your API key in the web interface when it loads")
     demo = create_interface()
+    demo.launch(
+        share=True
+    )