shukdevdattaEX committed on
Commit
38b2ece
·
verified ·
1 Parent(s): a220a8f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +334 -0
app.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import base64
3
+ import io
4
+ import os
5
+ from openai import OpenAI
6
+ import PyPDF2
7
+ from PIL import Image
8
+ import speech_recognition as sr
9
+ import tempfile
10
+ import cv2
11
+ import numpy as np
12
+ from typing import List, Tuple, Optional
13
+ import json
14
+
15
class MultimodalChatbot:
    """Multimodal chat client backed by an OpenRouter-hosted Gemma model.

    Accepts text, PDF, audio, image and video inputs, converts each into a
    form the OpenAI-style chat-completions API understands (plain text or
    base64 ``image_url`` data URLs), and returns the model's reply.
    """

    def __init__(self, api_key: str):
        # OpenRouter exposes an OpenAI-compatible endpoint, so the official
        # OpenAI client is reused with a custom base URL.
        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
        self.model = "google/gemma-3n-e2b-it:free"
        # NOTE(review): declared but never sent to the API — each chat()
        # call is a single-turn request, so the model has no cross-turn
        # memory. Kept for interface compatibility; see chat().
        self.conversation_history = []

    def encode_image_to_base64(self, image) -> str:
        """Return a base64 string for *image*.

        Accepts either a filesystem path (whose raw bytes are encoded
        verbatim) or a PIL Image (re-encoded as PNG first).
        """
        if isinstance(image, str):
            # File path: encode the raw file bytes as-is.
            with open(image, "rb") as img_file:
                return base64.b64encode(img_file.read()).decode('utf-8')
        # PIL Image: serialize to PNG in memory, then encode.
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')

    def extract_pdf_text(self, pdf_file) -> str:
        """Extract and concatenate the text of every page of a PDF.

        Returns an error string instead of raising, so a failure can be
        shown inline in the chat transcript.
        """
        try:
            # Gradio may hand us a file object carrying a .name path, or a
            # plain path string.
            pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file

            text = ""
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n"
            return text.strip()
        except Exception as e:
            return f"Error extracting PDF: {str(e)}"

    def transcribe_audio(self, audio_file) -> str:
        """Transcribe an audio file to text via the free Google recognizer.

        Returns an error string instead of raising.
        NOTE(review): sr.AudioFile natively reads WAV/AIFF/FLAC only, so
        MP3/M4A uploads will surface here as an error string — confirm the
        accepted upload types against the UI.
        """
        try:
            recognizer = sr.Recognizer()

            audio_path = audio_file.name if hasattr(audio_file, 'name') else audio_file

            with sr.AudioFile(audio_path) as source:
                audio_data = recognizer.record(source)
                text = recognizer.recognize_google(audio_data)
            return text
        except Exception as e:
            return f"Error transcribing audio: {str(e)}"

    def process_video(self, video_file) -> List[str]:
        """Sample up to 10 frames (one every 30) from a video as base64 PNGs.

        On failure returns a single-element list containing an error string.
        """
        try:
            video_path = video_file.name if hasattr(video_file, 'name') else video_file

            cap = cv2.VideoCapture(video_path)
            frames = []
            frame_index = 0

            # BUG FIX: the original called cap.read() both in the loop
            # condition and in the body (silently discarding every other
            # frame) and capped frame_count — the count of frames SCANNED,
            # not sampled — at 10 while sampling every 30th, so at most one
            # frame was ever collected. Read each frame exactly once and
            # stop after 10 *sampled* frames.
            while len(frames) < 10:
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_index % 30 == 0:
                    # OpenCV decodes frames as BGR; PIL expects RGB.
                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    pil_image = Image.fromarray(rgb_frame)
                    frames.append(self.encode_image_to_base64(pil_image))
                frame_index += 1

            cap.release()
            return frames
        except Exception as e:
            return [f"Error processing video: {str(e)}"]

    def create_multimodal_message(self,
                                  text_input: str = "",
                                  pdf_file=None,
                                  audio_file=None,
                                  image_file=None,
                                  video_file=None) -> dict:
        """Build one user message whose content list mixes text and images.

        PDF text and audio transcriptions are injected as labelled text
        parts; images and sampled video frames become base64 data URLs.
        """
        content_parts = []

        # Plain text from the user, if any.
        if text_input:
            content_parts.append({"type": "text", "text": text_input})

        # PDF: extracted text is passed to the model as labelled text.
        if pdf_file is not None:
            pdf_text = self.extract_pdf_text(pdf_file)
            content_parts.append({
                "type": "text",
                "text": f"PDF Content:\n{pdf_text}"
            })

        # Audio: transcription is passed as labelled text.
        if audio_file is not None:
            audio_text = self.transcribe_audio(audio_file)
            content_parts.append({
                "type": "text",
                "text": f"Audio Transcription:\n{audio_text}"
            })

        # Image: embedded as a base64 PNG data URL.
        if image_file is not None:
            image_base64 = self.encode_image_to_base64(image_file)
            content_parts.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_base64}"
                }
            })

        # Video: each sampled frame becomes its own image part. Error
        # strings from process_video() are skipped rather than embedded.
        if video_file is not None:
            video_frames = self.process_video(video_file)
            for frame_base64 in video_frames:
                if not frame_base64.startswith("Error"):
                    content_parts.append({
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{frame_base64}"
                        }
                    })

        return {"role": "user", "content": content_parts}

    def chat(self,
             text_input: str = "",
             pdf_file=None,
             audio_file=None,
             image_file=None,
             video_file=None,
             history: List[Tuple[str, str]] = None) -> Tuple[List[Tuple[str, str]], str]:
        """Run one chat turn.

        Returns (updated_history, "") — the empty string clears the UI
        textbox. API failures are appended to the history as error rows
        rather than raised.
        """
        if history is None:
            history = []

        # FIX: build the human-readable turn summary BEFORE the try block so
        # it is always defined — the original constructed it inside try and
        # fell back to a 'user_display' in locals() check in the handler.
        user_message_parts = []
        if text_input:
            user_message_parts.append(f"Text: {text_input}")
        if pdf_file:
            user_message_parts.append("πŸ“„ PDF uploaded")
        if audio_file:
            user_message_parts.append("🎀 Audio uploaded")
        if image_file:
            user_message_parts.append("πŸ–ΌοΈ Image uploaded")
        if video_file:
            user_message_parts.append("πŸŽ₯ Video uploaded")

        user_display = " | ".join(user_message_parts)

        try:
            # Assemble the multimodal payload for this turn.
            user_message = self.create_multimodal_message(
                text_input, pdf_file, audio_file, image_file, video_file
            )

            # NOTE(review): only the current turn is sent — prior turns in
            # `history` are display strings, not API messages, so the model
            # receives no conversation context.
            messages = [user_message]

            completion = self.client.chat.completions.create(
                extra_headers={
                    "HTTP-Referer": "https://multimodal-chatbot.local",
                    "X-Title": "Multimodal Chatbot",
                },
                model=self.model,
                messages=messages,
                max_tokens=1024,
                temperature=0.7
            )

            bot_response = completion.choices[0].message.content

            history.append((user_display, bot_response))
            return history, ""

        except Exception as e:
            # Surface the failure in the transcript; keep the original
            # fallback label when the user supplied nothing at all.
            history.append((user_display or "Error in input", f"Error: {str(e)}"))
            return history, ""
213
def create_interface():
    """Assemble and return the Gradio Blocks UI for the multimodal chatbot.

    The OpenRouter key is read from the OPENROUTER_API_KEY environment
    variable; a placeholder keeps the app importable when it is unset.
    """
    bot = MultimodalChatbot(os.getenv("OPENROUTER_API_KEY", "your_api_key_here"))

    with gr.Blocks(title="Multimodal Chatbot with Gemma 3n", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # πŸ€– Multimodal Chatbot with Gemma 3n

        This chatbot can process multiple types of input:
        - **Text**: Regular text messages
        - **PDF**: Extract and analyze document content
        - **Audio**: Transcribe speech to text
        - **Images**: Analyze visual content
        - **Video**: Extract frames and analyze video content

        **Setup**: Set your OpenRouter API key as an environment variable `OPENROUTER_API_KEY`
        """)

        with gr.Row():
            # Left column: every input widget plus the action buttons.
            with gr.Column(scale=1):
                txt_box = gr.Textbox(
                    label="πŸ’¬ Text Input",
                    placeholder="Type your message here...",
                    lines=3
                )
                pdf_upload = gr.File(
                    label="πŸ“„ PDF Upload",
                    file_types=[".pdf"],
                    type="filepath"
                )
                audio_upload = gr.File(
                    label="🎀 Audio Upload",
                    file_types=[".wav", ".mp3", ".m4a", ".flac"],
                    type="filepath"
                )
                img_upload = gr.Image(
                    label="πŸ–ΌοΈ Image Upload",
                    type="pil"
                )
                video_upload = gr.File(
                    label="πŸŽ₯ Video Upload",
                    file_types=[".mp4", ".avi", ".mov", ".mkv"],
                    type="filepath"
                )
                send_btn = gr.Button("πŸš€ Send", variant="primary", size="lg")
                reset_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")

            # Right column: the conversation transcript.
            with gr.Column(scale=2):
                chat_panel = gr.Chatbot(
                    label="Chat History",
                    height=600,
                    bubble_full_width=False
                )

        def on_submit(text, pdf, audio, image, video, history):
            # Delegate one turn to the chatbot; returns (history, "") so the
            # textbox is cleared after each send.
            return bot.chat(text, pdf, audio, image, video, history)

        def on_clear():
            # Reset the transcript, the textbox and every upload slot.
            return [], "", None, None, None, None

        # Both the Send button and the textbox's Enter key trigger the same
        # handler over the same components.
        turn_inputs = [txt_box, pdf_upload, audio_upload, img_upload, video_upload, chat_panel]
        turn_outputs = [chat_panel, txt_box]

        send_btn.click(on_submit, inputs=turn_inputs, outputs=turn_outputs)

        reset_btn.click(
            on_clear,
            outputs=[chat_panel, txt_box, pdf_upload, audio_upload, img_upload, video_upload]
        )

        txt_box.submit(on_submit, inputs=turn_inputs, outputs=turn_outputs)

        gr.Markdown("""
        ### 🎯 Example Usage:
        - Upload a PDF and ask "Summarize this document"
        - Upload an image and ask "What do you see in this image?"
        - Record audio and ask "What did I say?"
        - Upload a video and ask "Describe what's happening"
        - Combine multiple inputs: "Compare this image with the PDF content"
        """)

    return demo
315
if __name__ == "__main__":
    # Third-party requirements, listed for anyone running the app locally.
    required_packages = [
        "gradio",
        "openai",
        "PyPDF2",
        "Pillow",
        "SpeechRecognition",
        "opencv-python",
        "numpy"
    ]

    comma_separated = ", ".join(required_packages)
    space_separated = " ".join(required_packages)
    print("Required packages:", comma_separated)
    print("\nTo install: pip install " + space_separated)
    print("\nDon't forget to set your OPENROUTER_API_KEY environment variable!")

    # Build the UI and serve it with a public share link.
    demo = create_interface()
    demo.launch(share=True)