shukdevdattaEX committed on
Commit
2b051f4
·
verified ·
1 Parent(s): 11f4277

Create v1.txt

Browse files
Files changed (1) hide show
  1. v1.txt +437 -0
v1.txt ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import base64
3
+ import io
4
+ import os
5
+ from openai import OpenAI
6
+ import PyPDF2
7
+ import speech_recognition as sr
8
+ import tempfile
9
+ from pydub import AudioSegment
10
+ from typing import List, Tuple, Optional
11
+
12
class MultimodalChatbot:
    """Multimodal chat client backed by an OpenRouter-hosted Gemma model.

    Accepts plain text, PDF documents (text-extracted via PyPDF2), and audio
    files (transcribed via SpeechRecognition), and folds them into a single
    multimodal user message sent to the chat-completions endpoint.
    """

    def __init__(self, api_key: str):
        """Create an OpenAI-compatible client pointed at OpenRouter.

        Args:
            api_key: OpenRouter API key used for this session only.
        """
        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
        self.model = "google/gemma-3n-e2b-it:free"
        # NOTE(review): the UI layer builds a fresh instance per request, so
        # this per-instance history never accumulates across turns.
        self.conversation_history = []

    def extract_pdf_text(self, pdf_file) -> str:
        """Extract text from a PDF, labelled page by page.

        Args:
            pdf_file: Either a filesystem path or an object with a ``.name``
                attribute (e.g. a Gradio file wrapper).

        Returns:
            The extracted text, a "no text" notice for image-only PDFs, or an
            error string (this method never raises).
        """
        try:
            # Gradio may hand us either a filepath string or a file wrapper.
            pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file

            # Collect page chunks and join once instead of quadratic +=.
            chunks = []
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num, page in enumerate(pdf_reader.pages):
                    page_text = page.extract_text()
                    if page_text.strip():
                        chunks.append(f"Page {page_num + 1}:\n{page_text}\n\n")
            text = "".join(chunks)
            return text.strip() if text.strip() else "No text could be extracted from this PDF."
        except Exception as e:
            return f"Error extracting PDF: {str(e)}"

    def convert_audio_to_wav(self, audio_file) -> str:
        """Convert an audio file to 16 kHz mono WAV for speech recognition.

        Args:
            audio_file: Filesystem path or object with a ``.name`` attribute.

        Returns:
            Path to a WAV file (the input path itself if already ``.wav``).

        Raises:
            Exception: With a descriptive message if conversion fails.
        """
        try:
            audio_path = audio_file.name if hasattr(audio_file, 'name') else audio_file

            file_ext = os.path.splitext(audio_path)[1].lower()
            if file_ext == '.wav':
                return audio_path

            audio = AudioSegment.from_file(audio_path)
            # tempfile.mktemp is deprecated and race-prone; mkstemp creates
            # the file securely — we only need its path for pydub/ffmpeg.
            fd, wav_path = tempfile.mkstemp(suffix='.wav')
            os.close(fd)
            audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
            return wav_path
        except Exception as e:
            raise Exception(f"Error converting audio: {str(e)}")

    def transcribe_audio(self, audio_file) -> str:
        """Transcribe an audio file to text.

        Tries the online Google recognizer first and falls back to the
        offline Sphinx engine if the service is unreachable.

        Returns:
            The transcription, or a human-readable error string (never raises).
        """
        try:
            recognizer = sr.Recognizer()
            wav_path = self.convert_audio_to_wav(audio_file)

            with sr.AudioFile(wav_path) as source:
                recognizer.adjust_for_ambient_noise(source, duration=0.2)
                audio_data = recognizer.record(source)

            try:
                return recognizer.recognize_google(audio_data)
            except sr.UnknownValueError:
                return "Could not understand the audio. Please try with clearer audio."
            except sr.RequestError as e:
                # Online service unavailable — try the offline engine, but
                # never let a bare failure escape (Sphinx may not be installed).
                try:
                    return recognizer.recognize_sphinx(audio_data)
                except Exception:
                    return f"Speech recognition service error: {str(e)}"
        except Exception as e:
            return f"Error transcribing audio: {str(e)}"

    def create_multimodal_message(self,
                                  text_input: str = "",
                                  pdf_file=None,
                                  audio_file=None) -> Tuple[dict, list]:
        """Build a multimodal user message for the chat API.

        Args:
            text_input: Free-form text from the user (skipped if empty).
            pdf_file: Optional PDF whose extracted text is appended.
            audio_file: Optional audio file whose transcription is appended.

        Returns:
            A ``(message, processing_info)`` pair: the API-shaped user message
            and a list of human-readable notes about what was processed.
        """
        content_parts = []
        processing_info = []

        if text_input:
            content_parts.append({"type": "text", "text": text_input})

        if pdf_file is not None:
            pdf_text = self.extract_pdf_text(pdf_file)
            content_parts.append({
                "type": "text",
                "text": f"PDF Content:\n{pdf_text}"
            })
            processing_info.append("📄 PDF processed")

        if audio_file is not None:
            audio_text = self.transcribe_audio(audio_file)
            content_parts.append({
                "type": "text",
                "text": f"Audio Transcription:\n{audio_text}"
            })
            processing_info.append("🎤 Audio transcribed")

        return {"role": "user", "content": content_parts}, processing_info

    def chat(self,
             text_input: str = "",
             pdf_file=None,
             audio_file=None,
             history: Optional[List[Tuple[str, str]]] = None) -> Tuple[List[Tuple[str, str]], str]:
        """Run one chat turn against the model.

        Args:
            text_input: User text (may be empty if a file is supplied).
            pdf_file: Optional PDF upload.
            audio_file: Optional audio upload.
            history: Existing ``(user, bot)`` chat pairs; created if ``None``.

        Returns:
            ``(updated_history, "")`` — the empty string clears the input box.
        """
        if history is None:
            history = []

        # Initialize up front so the except-branch never sees it unbound.
        user_display = "Error in input"
        try:
            user_message_parts = []
            if text_input:
                user_message_parts.append(f"Text: {text_input}")
            if pdf_file:
                user_message_parts.append("📄 PDF uploaded")
            if audio_file:
                user_message_parts.append("🎤 Audio uploaded")

            # Guard: with no input at all the API would receive an empty
            # content list and return a raw error — fail fast and clearly.
            if not user_message_parts:
                history.append(("", "Please provide some input (text, PDF, or audio)."))
                return history, ""

            user_display = " | ".join(user_message_parts)

            user_message, processing_info = self.create_multimodal_message(
                text_input, pdf_file, audio_file
            )

            if processing_info:
                user_display += f"\n{' | '.join(processing_info)}"

            messages = [user_message]

            completion = self.client.chat.completions.create(
                extra_headers={
                    "HTTP-Referer": "https://multimodal-chatbot.local",
                    "X-Title": "Multimodal Chatbot",
                },
                model=self.model,
                messages=messages,
                max_tokens=2048,
                temperature=0.7
            )

            bot_response = completion.choices[0].message.content
            history.append((user_display, bot_response))
            return history, ""

        except Exception as e:
            history.append((user_display, f"Error: {str(e)}"))
            return history, ""
161
+
162
def create_interface():
    """Build and return the Gradio Blocks UI for the multimodal chatbot.

    Layout: an API-key row, then four tabs (text-only, PDF, audio, combined),
    each with its own chat history. Submit buttons start disabled and are
    enabled once an API key is entered. Returns the ``gr.Blocks`` object.
    """
    with gr.Blocks(title="Multimodal Chatbot with Gemma 3n", theme=gr.themes.Soft()) as demo:
        # Intro banner describing the supported input types.
        gr.Markdown("""
        # 🤖 Multimodal Chatbot with Gemma 3n

        This chatbot can process multiple types of input:
        - **Text**: Regular text messages
        - **PDF**: Extract and analyze document content
        - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)

        **Setup**: Enter your OpenRouter API key below to get started
        """)

        with gr.Row():
            with gr.Column():
                # Key is kept only in the live session; never persisted.
                api_key_input = gr.Textbox(
                    label="🔑 OpenRouter API Key",
                    placeholder="Enter your OpenRouter API key here...",
                    type="password",
                    info="Your API key is not stored and only used for this session"
                )
                api_status = gr.Textbox(
                    label="Connection Status",
                    value="❌ API Key not provided",
                    interactive=False
                )

        with gr.Tabs():
            # --- Tab 1: plain text chat -----------------------------------
            with gr.TabItem("💬 Text Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        text_input = gr.Textbox(
                            label="💬 Text Input",
                            placeholder="Type your message here...",
                            lines=5
                        )
                        # Disabled until the API key is validated below.
                        text_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                        text_clear_btn = gr.Button("🗑️ Clear", variant="secondary")

                    with gr.Column(scale=2):
                        text_chatbot = gr.Chatbot(
                            label="Text Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )

            # --- Tab 2: PDF question-answering ----------------------------
            with gr.TabItem("📄 PDF Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        pdf_input = gr.File(
                            label="📄 PDF Upload",
                            file_types=[".pdf"],
                            type="filepath"
                        )
                        pdf_text_input = gr.Textbox(
                            label="💬 Question about PDF",
                            placeholder="Ask something about the PDF...",
                            lines=3
                        )
                        pdf_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                        pdf_clear_btn = gr.Button("🗑️ Clear", variant="secondary")

                    with gr.Column(scale=2):
                        pdf_chatbot = gr.Chatbot(
                            label="PDF Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )

            # --- Tab 3: audio transcription + analysis --------------------
            with gr.TabItem("🎤 Audio Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        audio_input = gr.File(
                            label="🎤 Audio Upload",
                            file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
                            type="filepath"
                        )
                        audio_text_input = gr.Textbox(
                            label="💬 Question about Audio",
                            placeholder="Ask something about the audio...",
                            lines=3
                        )
                        audio_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                        audio_clear_btn = gr.Button("🗑️ Clear", variant="secondary")

                    with gr.Column(scale=2):
                        audio_chatbot = gr.Chatbot(
                            label="Audio Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )

            # --- Tab 4: all input types at once ---------------------------
            with gr.TabItem("🌟 Combined Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        combined_text_input = gr.Textbox(
                            label="💬 Text Input",
                            placeholder="Type your message here...",
                            lines=3
                        )
                        combined_pdf_input = gr.File(
                            label="📄 PDF Upload",
                            file_types=[".pdf"],
                            type="filepath"
                        )
                        combined_audio_input = gr.File(
                            label="🎤 Audio Upload",
                            file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
                            type="filepath"
                        )
                        combined_submit_btn = gr.Button("🚀 Send All", variant="primary", size="lg", interactive=False)
                        combined_clear_btn = gr.Button("🗑️ Clear All", variant="secondary")

                    with gr.Column(scale=2):
                        combined_chatbot = gr.Chatbot(
                            label="Combined Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )

        def validate_api_key(api_key):
            """Update the status box and toggle the four submit buttons.

            Returns a 5-tuple: status text + one gr.update per submit button.
            """
            if not api_key or len(api_key.strip()) == 0:
                return "❌ API Key not provided", *[gr.update(interactive=False) for _ in range(4)]

            try:
                # NOTE(review): constructing the client does not contact the
                # server, so this only confirms the key is non-empty — a bad
                # key will still fail later at chat time. Verify if a real
                # round-trip check is wanted here.
                test_client = OpenAI(
                    base_url="https://openrouter.ai/api/v1",
                    api_key=api_key.strip(),
                )
                return "✅ API Key validated successfully", *[gr.update(interactive=True) for _ in range(4)]
            except Exception as e:
                return f"❌ API Key validation failed: {str(e)}", *[gr.update(interactive=False) for _ in range(4)]

        def process_text_input(api_key, text, history):
            """Handle a text-tab submission; returns (history, cleared input)."""
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""

            # A fresh chatbot per request — stateless across turns.
            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, history=history)

        def process_pdf_input(api_key, pdf, text, history):
            """Handle a PDF-tab submission; returns (history, cleared input)."""
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""

            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, pdf_file=pdf, history=history)

        def process_audio_input(api_key, audio, text, history):
            """Handle an audio-tab submission; returns (history, cleared input)."""
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""

            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, audio_file=audio, history=history)

        def process_combined_input(api_key, text, pdf, audio, history):
            """Handle a combined-tab submission with all three input kinds."""
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""

            chatbot = MultimodalChatbot(api_key.strip())
            # Positional: chat(text_input, pdf_file, audio_file, history).
            return chatbot.chat(text, pdf, audio, history)

        def clear_chat():
            """Reset a chat history and its text box."""
            return [], ""

        def clear_all_inputs():
            """Reset the combined tab: history, text, PDF, and audio inputs."""
            return [], "", None, None

        # Re-validate on every keystroke; also gates all four submit buttons.
        api_key_input.change(
            validate_api_key,
            inputs=[api_key_input],
            outputs=[api_status, text_submit_btn, pdf_submit_btn, audio_submit_btn, combined_submit_btn]
        )

        text_submit_btn.click(
            process_text_input,
            inputs=[api_key_input, text_input, text_chatbot],
            outputs=[text_chatbot, text_input]
        )
        # Enter key in the text box behaves like the Send button.
        text_input.submit(
            process_text_input,
            inputs=[api_key_input, text_input, text_chatbot],
            outputs=[text_chatbot, text_input]
        )
        text_clear_btn.click(clear_chat, outputs=[text_chatbot, text_input])

        pdf_submit_btn.click(
            process_pdf_input,
            inputs=[api_key_input, pdf_input, pdf_text_input, pdf_chatbot],
            outputs=[pdf_chatbot, pdf_text_input]
        )
        pdf_clear_btn.click(lambda: ([], "", None), outputs=[pdf_chatbot, pdf_text_input, pdf_input])

        audio_submit_btn.click(
            process_audio_input,
            inputs=[api_key_input, audio_input, audio_text_input, audio_chatbot],
            outputs=[audio_chatbot, audio_text_input]
        )
        audio_clear_btn.click(lambda: ([], "", None), outputs=[audio_chatbot, audio_text_input, audio_input])

        combined_submit_btn.click(
            process_combined_input,
            inputs=[api_key_input, combined_text_input, combined_pdf_input,
                    combined_audio_input, combined_chatbot],
            outputs=[combined_chatbot, combined_text_input]
        )
        combined_clear_btn.click(clear_all_inputs,
                                 outputs=[combined_chatbot, combined_text_input,
                                          combined_pdf_input, combined_audio_input])

        # Footer with per-tab usage notes and API-key instructions.
        gr.Markdown("""
        ### 🎯 How to Use Each Tab:

        **💬 Text Chat**: Simple text conversations with the AI

        **📄 PDF Chat**: Upload a PDF and ask questions about its content

        **🎤 Audio Chat**: Upload audio files for transcription and analysis
        - Supports: WAV, MP3, M4A, FLAC, OGG formats
        - Best results with clear speech and minimal background noise

        **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis

        ### 🔑 Getting an API Key:
        1. Go to [OpenRouter.ai](https://openrouter.ai)
        2. Sign up for an account
        3. Navigate to the API Keys section
        4. Create a new API key
        5. Copy and paste it in the field above

        ### ⚠️ Current Limitations:
        - Audio transcription requires internet connection for best results
        - Large files may take longer to process
        """)

    return demo
414
+
415
if __name__ == "__main__":
    # Informational banner: list the third-party packages this app imports.
    deps = ["gradio", "openai", "PyPDF2", "SpeechRecognition", "pydub"]

    print("🚀 Multimodal Chatbot with Gemma 3n")
    print("=" * 50)
    print("Required packages:", ", ".join(deps))
    print("\n📦 To install: pip install " + " ".join(deps))
    print("\n🎤 For audio processing, you may also need:")
    print(" - ffmpeg (for audio conversion)")
    print(" - sudo apt-get install espeak espeak-data libespeak1 libespeak-dev (for offline speech recognition)")
    print("\n🔑 Get your API key from: https://openrouter.ai")
    print("💡 Enter your API key in the web interface when it loads")

    # Build the UI and serve it with a public share link.
    demo = create_interface()
    demo.launch(share=True)