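"""Multimodal chatbot demo.

A Gradio app that sends text, PDF, audio, image, and video inputs to the free
Gemma 3n model via OpenRouter. PDFs are reduced to extracted text, audio is
transcribed to text, and images/videos are summarized by metadata only, since
this model configuration does not accept vision input.
"""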
import gradio as gr
import base64
import io
import os
import tempfile
from typing import List, Tuple

import cv2
import PyPDF2
import speech_recognition as sr
from openai import OpenAI
from PIL import Image
from pydub import AudioSegment

class MultimodalChatbot:
    def __init__(self, api_key: str):
        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
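        # Free-tier Gemma 3n endpoint on OpenRouter; any other OpenRouter
        # model ID could be substituted here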
        self.model = "google/gemma-3n-e2b-it:free"
        self.conversation_history = []
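        # NOTE: currently unused; chat() sends only the latest message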
        
    def encode_image_to_base64(self, image) -> str:
        """Convert PIL Image to base64 string"""
        try:
            if isinstance(image, str):
                # If it's a file path
                with open(image, "rb") as img_file:
                    return base64.b64encode(img_file.read()).decode('utf-8')
            else:
                # If it's a PIL Image
                buffered = io.BytesIO()
                # JPEG supports neither alpha nor palette modes, so normalize
                # anything that isn't already RGB
                if image.mode != 'RGB':
                    image = image.convert('RGB')
                image.save(buffered, format="JPEG", quality=85)
                return base64.b64encode(buffered.getvalue()).decode('utf-8')
        except Exception as e:
            return f"Error encoding image: {str(e)}"
    
    def extract_pdf_text(self, pdf_file) -> str:
        """Extract text from PDF file"""
        try:
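            # Note: PyPDF2 is in maintenance mode; its successor is pypdf, and
            # the PdfReader API used here exists in both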
            if hasattr(pdf_file, 'name'):
                # Gradio file object
                pdf_path = pdf_file.name
            else:
                pdf_path = pdf_file
                
            text = ""
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num, page in enumerate(pdf_reader.pages):
                    page_text = page.extract_text()
                    if page_text and page_text.strip():
                        text += f"Page {page_num + 1}:\n{page_text}\n\n"
            return text.strip() if text.strip() else "No text could be extracted from this PDF."
        except Exception as e:
            return f"Error extracting PDF: {str(e)}"
    
    def convert_audio_to_wav(self, audio_file) -> str:
        """Convert audio file to WAV format for speech recognition"""
        try:
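            # pydub decodes non-WAV formats through ffmpeg, which must be
            # installed and on PATH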
            if hasattr(audio_file, 'name'):
                audio_path = audio_file.name
            else:
                audio_path = audio_file
            
            # Get file extension
            file_ext = os.path.splitext(audio_path)[1].lower()
            
            # If already WAV, return as is
            if file_ext == '.wav':
                return audio_path
            
            # Convert to WAV using pydub; downmix to mono at 16 kHz, the
            # typical input format for speech recognition
            audio = AudioSegment.from_file(audio_path)
            audio = audio.set_channels(1).set_frame_rate(16000)
            # mkstemp replaces the insecure, deprecated tempfile.mktemp
            fd, wav_path = tempfile.mkstemp(suffix='.wav')
            os.close(fd)
            audio.export(wav_path, format="wav")
            return wav_path
            
        except Exception as e:
            raise Exception(f"Error converting audio: {str(e)}")
    
    def transcribe_audio(self, audio_file) -> str:
        """Transcribe audio file to text"""
        try:
            recognizer = sr.Recognizer()
            
            # Convert audio to WAV format
            wav_path = self.convert_audio_to_wav(audio_file)
            
            with sr.AudioFile(wav_path) as source:
                # Adjust for ambient noise
                recognizer.adjust_for_ambient_noise(source, duration=0.2)
                audio_data = recognizer.record(source)
                
                # Try Google Speech Recognition (free web API; needs internet)
                try:
                    text = recognizer.recognize_google(audio_data)
                    return text
                except sr.UnknownValueError:
                    return "Could not understand the audio. Please try with clearer audio."
                except sr.RequestError as e:
                    # Fall back to offline recognition (requires the
                    # pocketsphinx package)
                    try:
                        text = recognizer.recognize_sphinx(audio_data)
                        return text
                    except Exception:
                        return f"Speech recognition service error: {str(e)}"
                        
        except Exception as e:
            return f"Error transcribing audio: {str(e)}"
    
    def process_video(self, video_file) -> Tuple[List[str], str]:
        """Extract frames from video and convert to base64"""
        try:
            if hasattr(video_file, 'name'):
                video_path = video_file.name
            else:
                video_path = video_file
                
            cap = cv2.VideoCapture(video_path)
            if not cap.isOpened():
                return [], "Error: Could not open video file"
            
            frames = []
            frame_descriptions = []
            frame_count = 0
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = cap.get(cv2.CAP_PROP_FPS)
            
            # Sample one frame every ~2 seconds, but no more often than every
            # 60 frames, up to 5 frames total
            frame_interval = max(60, int(fps * 2)) if fps > 0 else 60
            
            while len(frames) < 5:  # Limit to 5 frames
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_count % frame_interval == 0:
                    # Convert BGR to RGB
                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    pil_image = Image.fromarray(rgb_frame)
                    
                    # Resize image to reduce payload size
                    pil_image.thumbnail((800, 600), Image.Resampling.LANCZOS)
                    
                    base64_frame = self.encode_image_to_base64(pil_image)
                    if not base64_frame.startswith("Error"):
                        frames.append(base64_frame)
                        timestamp = frame_count / fps if fps > 0 else frame_count
                        frame_descriptions.append(f"Frame at {timestamp:.1f}s")
                
                frame_count += 1
            
            cap.release()
            
            description = f"Video processed: {len(frames)} frames extracted from {total_frames} total frames"
            return frames, description
            
        except Exception as e:
            return [], f"Error processing video: {str(e)}"
    
    def create_multimodal_message(self, 
                                text_input: str = "",
                                pdf_file=None,
                                audio_file=None,
                                image_file=None,
                                video_file=None) -> Tuple[dict, List[str]]:
        """Create a multimodal message for the API, plus a list of processing notes"""
        
        content_parts = []
        processing_info = []
        
        # Add text content
        if text_input:
            content_parts.append({"type": "text", "text": text_input})
        
        # Process PDF
        if pdf_file is not None:
            pdf_text = self.extract_pdf_text(pdf_file)
            content_parts.append({
                "type": "text", 
                "text": f"PDF Content:\n{pdf_text}"
            })
            processing_info.append("πŸ“„ PDF processed")
        
        # Process Audio
        if audio_file is not None:
            audio_text = self.transcribe_audio(audio_file)
            content_parts.append({
                "type": "text", 
                "text": f"Audio Transcription:\n{audio_text}"
            })
            processing_info.append("🎀 Audio transcribed")
        
        # Process Image - Use text-only approach since vision isn't supported
        if image_file is not None:
            # Since vision isn't supported, we'll describe what we can about the image
            if hasattr(image_file, 'size'):
                width, height = image_file.size
                mode = image_file.mode
                content_parts.append({
                    "type": "text",
                    "text": f"Image uploaded: {width}x{height} pixels, mode: {mode}. Note: Visual analysis not available with current model configuration."
                })
            else:
                content_parts.append({
                    "type": "text",
                    "text": "Image uploaded. Note: Visual analysis not available with current model configuration."
                })
            processing_info.append("πŸ–ΌοΈ Image received (metadata only)")
        
        # Process Video - Use text-only approach since vision isn't supported
        if video_file is not None:
            frames, video_desc = self.process_video(video_file)
            content_parts.append({
                "type": "text",
                "text": f"Video uploaded: {video_desc}. Note: Visual analysis not available with current model configuration."
            })
            processing_info.append("πŸŽ₯ Video processed (metadata only)")
        
        return {"role": "user", "content": content_parts}, processing_info
    
    def chat(self, 
             text_input: str = "",
             pdf_file=None,
             audio_file=None,
             image_file=None,
             video_file=None,
             history: List[Tuple[str, str]] = None) -> Tuple[List[Tuple[str, str]], str]:
        """Main chat function"""
        
        if history is None:
            history = []
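        # `history` holds (user_text, bot_text) tuples, the default message
        # format of gr.Chatbot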
        
        try:
            # Create user message summary for display
            user_message_parts = []
            if text_input:
                user_message_parts.append(f"Text: {text_input}")
            if pdf_file:
                user_message_parts.append("πŸ“„ PDF uploaded")
            if audio_file:
                user_message_parts.append("🎀 Audio uploaded")
            if image_file:
                user_message_parts.append("πŸ–ΌοΈ Image uploaded")
            if video_file:
                user_message_parts.append("πŸŽ₯ Video uploaded")
            
            user_display = " | ".join(user_message_parts)
            
            # Create multimodal message
            user_message, processing_info = self.create_multimodal_message(
                text_input, pdf_file, audio_file, image_file, video_file
            )
            
            # Add processing info to display
            if processing_info:
                user_display += f"\n{' | '.join(processing_info)}"
            
            # Send only the current message; a fresh client is created per
            # request, so no multi-turn history reaches the model
            messages = [user_message]
            
            # Get response from Gemma
            completion = self.client.chat.completions.create(
                extra_headers={
                    "HTTP-Referer": "https://multimodal-chatbot.local",
                    "X-Title": "Multimodal Chatbot",
                },
                model=self.model,
                messages=messages,
                max_tokens=2048,
                temperature=0.7
            )
            
            bot_response = completion.choices[0].message.content
            
            # Update history
            history.append((user_display, bot_response))
            
            return history, ""
            
        except Exception as e:
            error_msg = f"Error: {str(e)}"
            history.append((user_display if 'user_display' in locals() else "Error in input", error_msg))
            return history, ""

def create_interface():
    """Create the Gradio interface"""
    
    with gr.Blocks(title="Multimodal Chatbot with Gemma 3n", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # πŸ€– Multimodal Chatbot with Gemma 3n
        
        This chatbot can process multiple types of input:
        - **Text**: Regular text messages
        - **PDF**: Extract and analyze document content  
        - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)
        - **Images**: Upload images (metadata analysis only due to model limitations)
        - **Video**: Upload videos (metadata analysis only due to model limitations)
        
        **Setup**: Enter your OpenRouter API key below to get started
        """)
        
        # API Key Input Section
        with gr.Row():
            with gr.Column():
                api_key_input = gr.Textbox(
                    label="πŸ”‘ OpenRouter API Key",
                    placeholder="Enter your OpenRouter API key here...",
                    type="password",
                    info="Your API key is not stored and is used only for this session"
                )
                api_status = gr.Textbox(
                    label="Connection Status",
                    value="❌ API Key not provided",
                    interactive=False
                )
        
        # Tabbed Interface
        with gr.Tabs():
            # Text Chat Tab
            with gr.TabItem("πŸ’¬ Text Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        text_input = gr.Textbox(
                            label="πŸ’¬ Text Input",
                            placeholder="Type your message here...",
                            lines=5
                        )
                        text_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
                        text_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
                    
                    with gr.Column(scale=2):
                        text_chatbot = gr.Chatbot(
                            label="Text Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )
            
            # PDF Chat Tab
            with gr.TabItem("πŸ“„ PDF Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        pdf_input = gr.File(
                            label="πŸ“„ PDF Upload",
                            file_types=[".pdf"],
                            type="filepath"
                        )
                        pdf_text_input = gr.Textbox(
                            label="πŸ’¬ Question about PDF",
                            placeholder="Ask something about the PDF...",
                            lines=3
                        )
                        pdf_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
                        pdf_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
                    
                    with gr.Column(scale=2):
                        pdf_chatbot = gr.Chatbot(
                            label="PDF Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )
            
            # Audio Chat Tab
            with gr.TabItem("🎀 Audio Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        audio_input = gr.File(
                            label="🎀 Audio Upload", 
                            file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
                            type="filepath"
                        )
                        audio_text_input = gr.Textbox(
                            label="πŸ’¬ Question about Audio",
                            placeholder="Ask something about the audio...",
                            lines=3
                        )
                        audio_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
                        audio_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
                    
                    with gr.Column(scale=2):
                        audio_chatbot = gr.Chatbot(
                            label="Audio Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )
            
            # Image Chat Tab
            with gr.TabItem("πŸ–ΌοΈ Image Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        image_input = gr.Image(
                            label="πŸ–ΌοΈ Image Upload",
                            type="pil"
                        )
                        image_text_input = gr.Textbox(
                            label="πŸ’¬ Question about Image",
                            placeholder="Ask something about the image...",
                            lines=3
                        )
                        image_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
                        image_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
                    
                    with gr.Column(scale=2):
                        image_chatbot = gr.Chatbot(
                            label="Image Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )
            
            # Video Chat Tab
            with gr.TabItem("πŸŽ₯ Video Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        video_input = gr.File(
                            label="πŸŽ₯ Video Upload",
                            file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
                            type="filepath"
                        )
                        video_text_input = gr.Textbox(
                            label="πŸ’¬ Question about Video",
                            placeholder="Ask something about the video...",
                            lines=3
                        )
                        video_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
                        video_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
                    
                    with gr.Column(scale=2):
                        video_chatbot = gr.Chatbot(
                            label="Video Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )
            
            # Combined Chat Tab
            with gr.TabItem("🌟 Combined Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        combined_text_input = gr.Textbox(
                            label="πŸ’¬ Text Input",
                            placeholder="Type your message here...",
                            lines=3
                        )
                        
                        combined_pdf_input = gr.File(
                            label="πŸ“„ PDF Upload",
                            file_types=[".pdf"],
                            type="filepath"
                        )
                        
                        combined_audio_input = gr.File(
                            label="🎀 Audio Upload", 
                            file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
                            type="filepath"
                        )
                        
                        combined_image_input = gr.Image(
                            label="πŸ–ΌοΈ Image Upload",
                            type="pil"
                        )
                        
                        combined_video_input = gr.File(
                            label="πŸŽ₯ Video Upload",
                            file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
                            type="filepath"
                        )
                        
                        combined_submit_btn = gr.Button("πŸš€ Send All", variant="primary", size="lg", interactive=False)
                        combined_clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
                    
                    with gr.Column(scale=2):
                        combined_chatbot = gr.Chatbot(
                            label="Combined Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )
        
        # Event handlers
        def validate_api_key(api_key):
            if not api_key or len(api_key.strip()) == 0:
                return "❌ API Key not provided", *[gr.update(interactive=False) for _ in range(6)]
            
            try:
                # Constructing the client only checks the key's format; the
                # key itself is verified by OpenRouter on the first request
                test_client = OpenAI(
                    base_url="https://openrouter.ai/api/v1",
                    api_key=api_key.strip(),
                )
                return "βœ… API Key accepted (verified on first request)", *[gr.update(interactive=True) for _ in range(6)]
            except Exception as e:
                return f"❌ API Key validation failed: {str(e)}", *[gr.update(interactive=False) for _ in range(6)]
        
        def process_text_input(api_key, text, history):
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""
            
            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, history=history)
        
        def process_pdf_input(api_key, pdf, text, history):
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""
            
            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, pdf_file=pdf, history=history)
        
        def process_audio_input(api_key, audio, text, history):
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""
            
            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, audio_file=audio, history=history)
        
        def process_image_input(api_key, image, text, history):
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""
            
            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, image_file=image, history=history)
        
        def process_video_input(api_key, video, text, history):
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""
            
            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, video_file=video, history=history)
        
        def process_combined_input(api_key, text, pdf, audio, image, video, history):
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""
            
            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text, pdf, audio, image, video, history)
        
        def clear_chat():
            return [], ""
        
        def clear_all_inputs():
            return [], "", None, None, None, None
        
        # API Key validation
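        # validate_api_key returns (status, *six button updates), matching the
        # seven outputs wired below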
        api_key_input.change(
            validate_api_key,
            inputs=[api_key_input],
            outputs=[api_status, text_submit_btn, pdf_submit_btn, audio_submit_btn, 
                    image_submit_btn, video_submit_btn, combined_submit_btn]
        )
        
        # Text chat events
        text_submit_btn.click(
            process_text_input,
            inputs=[api_key_input, text_input, text_chatbot],
            outputs=[text_chatbot, text_input]
        )
        text_input.submit(
            process_text_input,
            inputs=[api_key_input, text_input, text_chatbot],
            outputs=[text_chatbot, text_input]
        )
        text_clear_btn.click(clear_chat, outputs=[text_chatbot, text_input])
        
        # PDF chat events
        pdf_submit_btn.click(
            process_pdf_input,
            inputs=[api_key_input, pdf_input, pdf_text_input, pdf_chatbot],
            outputs=[pdf_chatbot, pdf_text_input]
        )
        pdf_clear_btn.click(lambda: ([], "", None), outputs=[pdf_chatbot, pdf_text_input, pdf_input])
        
        # Audio chat events
        audio_submit_btn.click(
            process_audio_input,
            inputs=[api_key_input, audio_input, audio_text_input, audio_chatbot],
            outputs=[audio_chatbot, audio_text_input]
        )
        audio_clear_btn.click(lambda: ([], "", None), outputs=[audio_chatbot, audio_text_input, audio_input])
        
        # Image chat events
        image_submit_btn.click(
            process_image_input,
            inputs=[api_key_input, image_input, image_text_input, image_chatbot],
            outputs=[image_chatbot, image_text_input]
        )
        image_clear_btn.click(lambda: ([], "", None), outputs=[image_chatbot, image_text_input, image_input])
        
        # Video chat events
        video_submit_btn.click(
            process_video_input,
            inputs=[api_key_input, video_input, video_text_input, video_chatbot],
            outputs=[video_chatbot, video_text_input]
        )
        video_clear_btn.click(lambda: ([], "", None), outputs=[video_chatbot, video_text_input, video_input])
        
        # Combined chat events
        combined_submit_btn.click(
            process_combined_input,
            inputs=[api_key_input, combined_text_input, combined_pdf_input, 
                   combined_audio_input, combined_image_input, combined_video_input, combined_chatbot],
            outputs=[combined_chatbot, combined_text_input]
        )
        combined_clear_btn.click(clear_all_inputs, 
                               outputs=[combined_chatbot, combined_text_input, combined_pdf_input,
                                      combined_audio_input, combined_image_input, combined_video_input])
        
        # Examples and Instructions
        gr.Markdown("""
        ### 🎯 How to Use Each Tab:
        
        **πŸ’¬ Text Chat**: Simple text conversations with the AI
        
        **πŸ“„ PDF Chat**: Upload a PDF and ask questions about its content
        
        **🎀 Audio Chat**: Upload audio files for transcription and analysis
        - Supports: WAV, MP3, M4A, FLAC, OGG formats
        - Best results with clear speech and minimal background noise
        
        **πŸ–ΌοΈ Image Chat**: Upload images (currently metadata only due to model limitations)
        
        **πŸŽ₯ Video Chat**: Upload videos (currently metadata only due to model limitations)
        
        **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis
        
        ### πŸ”‘ Getting an API Key:
        1. Go to [OpenRouter.ai](https://openrouter.ai)
        2. Sign up for an account
        3. Navigate to the API Keys section
        4. Create a new API key
        5. Copy and paste it in the field above
        
        ### ⚠️ Current Limitations:
        - Image and video visual analysis not supported by the free Gemma 3n model
        - Audio transcription requires an internet connection for best results
        - Large files may take longer to process
        """)
    
    return demo

if __name__ == "__main__":
    # Required packages (install with pip):
    required_packages = [
        "gradio",
        "openai", 
        "PyPDF2",
        "Pillow",
        "SpeechRecognition",
        "opencv-python",
        "numpy",
        "pydub"
    ]
    
    print("πŸš€ Multimodal Chatbot with Gemma 3n")
    print("=" * 50)
    print("Required packages:", ", ".join(required_packages))
    print("\nπŸ“¦ To install: pip install " + " ".join(required_packages))
    print("\n🎀 For audio processing, you may also need:")
    print("   - ffmpeg (used by pydub for audio format conversion)")
    print("   - pip install pocketsphinx (for the offline speech recognition fallback)")
    print("\nπŸ”‘ Get your API key from: https://openrouter.ai")
    print("πŸ’‘ Enter your API key in the web interface when it loads")
    
    demo = create_interface()
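    # share=True additionally exposes the app through a temporary public
    # gradio.live URL; drop it to serve locally only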
    demo.launch(
        share=True
    )