# app.py — Multimodal Chatbot (shukdevdattaEX, commit 11f4277)
import gradio as gr
import base64
import io
import os
from openai import OpenAI
import PyPDF2
import speech_recognition as sr
import tempfile
from pydub import AudioSegment
from typing import List, Tuple, Optional
class MultimodalChatbot:
    """Multimodal chat client for an OpenRouter-hosted Gemma model.

    Accepts plain text, PDF documents (text extracted with PyPDF2) and audio
    (transcribed with SpeechRecognition); everything is forwarded to the model
    as plain-text content parts.
    """

    def __init__(self, api_key: str):
        """Create an OpenAI-compatible client pointed at OpenRouter."""
        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
        self.model = "google/gemma-3n-e2b-it:free"
        # NOTE(review): kept for future multi-turn support; chat() currently
        # sends only the latest user message to the API (single-turn).
        self.conversation_history = []

    @staticmethod
    def _resolve_path(file_obj) -> str:
        """Return a filesystem path from a Gradio file object or a plain path."""
        return file_obj.name if hasattr(file_obj, 'name') else file_obj

    @staticmethod
    def _temp_wav_path() -> str:
        """Create an empty temp .wav file and return its path.

        Uses mkstemp instead of the deprecated, race-prone tempfile.mktemp.
        """
        fd, wav_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)  # pydub reopens the path itself on export()
        return wav_path

    def extract_pdf_text(self, pdf_file) -> str:
        """Extract text from every page of a PDF.

        Returns the concatenated page texts, a placeholder message when no
        text could be extracted, or an error message on failure. This method
        never raises — errors are reported as strings so they surface in chat.
        """
        try:
            pdf_path = self._resolve_path(pdf_file)
            pages = []
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num, page in enumerate(pdf_reader.pages):
                    page_text = page.extract_text()
                    if page_text.strip():  # skip image-only / empty pages
                        pages.append(f"Page {page_num + 1}:\n{page_text}")
            text = "\n\n".join(pages).strip()
            return text if text else "No text could be extracted from this PDF."
        except Exception as e:
            return f"Error extracting PDF: {str(e)}"

    def convert_audio_to_wav(self, audio_file) -> str:
        """Convert an audio file to 16 kHz mono WAV for speech recognition.

        Returns the original path unchanged when it is already a .wav file.

        Raises:
            Exception: with a descriptive message on any conversion failure.
        """
        try:
            audio_path = self._resolve_path(audio_file)
            file_ext = os.path.splitext(audio_path)[1].lower()
            if file_ext == '.wav':
                return audio_path
            audio = AudioSegment.from_file(audio_path)
            wav_path = self._temp_wav_path()
            # Force mono / 16 kHz — the typical input expected by recognizers.
            audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
            return wav_path
        except Exception as e:
            raise Exception(f"Error converting audio: {str(e)}")

    def _recognize(self, recognizer, audio_data, unclear_msg: str) -> str:
        """Run Google speech recognition with an offline Sphinx fallback.

        Returns the transcription, `unclear_msg` when the audio could not be
        understood, or a service-error message when both engines fail.
        """
        try:
            return recognizer.recognize_google(audio_data)
        except sr.UnknownValueError:
            return unclear_msg
        except sr.RequestError as e:
            try:
                # Offline fallback when the Google web API is unreachable.
                return recognizer.recognize_sphinx(audio_data)
            except Exception:  # narrowed from a bare `except:`
                return f"Speech recognition service error: {str(e)}"

    def transcribe_audio(self, audio_file) -> str:
        """Transcribe an uploaded audio file to text (never raises)."""
        try:
            recognizer = sr.Recognizer()
            wav_path = self.convert_audio_to_wav(audio_file)
            with sr.AudioFile(wav_path) as source:
                # Brief calibration so recognition copes with background noise.
                recognizer.adjust_for_ambient_noise(source, duration=0.2)
                audio_data = recognizer.record(source)
            return self._recognize(
                recognizer, audio_data,
                "Could not understand the audio. Please try with clearer audio.",
            )
        except Exception as e:
            return f"Error transcribing audio: {str(e)}"

    def transcribe_recorded_audio(self, audio_data) -> str:
        """Transcribe microphone audio to text (never raises).

        Accepts either raw WAV bytes or the (sample_rate, numpy_array) tuple
        produced by gr.Microphone(type="numpy") — the previous version only
        handled raw bytes and failed on the tuple the UI actually delivers.
        """
        try:
            recognizer = sr.Recognizer()
            wav_path = self._temp_wav_path()
            if isinstance(audio_data, tuple):
                # gr.Microphone(type="numpy") yields (sample_rate, samples).
                sample_rate, samples = audio_data
                channels = samples.shape[1] if samples.ndim > 1 else 1
                segment = AudioSegment(
                    samples.tobytes(),
                    frame_rate=sample_rate,
                    sample_width=samples.dtype.itemsize,
                    channels=channels,
                )
            else:
                segment = AudioSegment.from_file(io.BytesIO(audio_data), format="wav")
            segment.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
            with sr.AudioFile(wav_path) as source:
                recognizer.adjust_for_ambient_noise(source, duration=0.2)
                recorded = recognizer.record(source)
            return self._recognize(
                recognizer, recorded,
                "Could not understand the recorded audio. Please try with clearer audio.",
            )
        except Exception as e:
            return f"Error transcribing recorded audio: {str(e)}"

    def create_multimodal_message(self,
                                  text_input: str = "",
                                  pdf_file=None,
                                  audio_file=None,
                                  recorded_audio=None) -> Tuple[dict, list]:
        """Build the API user message from all provided inputs.

        Returns:
            (message, processing_info): the OpenAI-style user message dict,
            and a list of human-readable notes about what was processed.
            (The previous `-> dict` annotation was wrong: this has always
            returned a 2-tuple.)
        """
        content_parts = []
        processing_info = []
        if text_input:
            content_parts.append({"type": "text", "text": text_input})
        if pdf_file is not None:
            pdf_text = self.extract_pdf_text(pdf_file)
            content_parts.append({
                "type": "text",
                "text": f"PDF Content:\n{pdf_text}"
            })
            processing_info.append("📄 PDF processed")
        if audio_file is not None:
            audio_text = self.transcribe_audio(audio_file)
            content_parts.append({
                "type": "text",
                "text": f"Audio Transcription:\n{audio_text}"
            })
            processing_info.append("🎤 Audio transcribed")
        if recorded_audio is not None:
            audio_text = self.transcribe_recorded_audio(recorded_audio)
            content_parts.append({
                "type": "text",
                "text": f"Recorded Audio Transcription:\n{audio_text}"
            })
            processing_info.append("🎙️ Recorded audio transcribed")
        return {"role": "user", "content": content_parts}, processing_info

    def chat(self,
             text_input: str = "",
             pdf_file=None,
             audio_file=None,
             recorded_audio=None,
             history: List[Tuple[str, str]] = None) -> Tuple[List[Tuple[str, str]], str]:
        """Send one multimodal turn to the model and append it to `history`.

        Returns:
            (history, ""): the updated chat history and an empty string used
            by the UI to clear the text box. Errors are appended to history
            rather than raised.
        """
        if history is None:
            history = []
        user_display = ""
        try:
            # Short human-readable summary of what the user submitted.
            user_message_parts = []
            if text_input:
                user_message_parts.append(f"Text: {text_input}")
            if pdf_file:
                user_message_parts.append("📄 PDF uploaded")
            if audio_file:
                user_message_parts.append("🎤 Audio uploaded")
            if recorded_audio:
                user_message_parts.append("🎙️ Recorded audio")
            user_display = " | ".join(user_message_parts)
            user_message, processing_info = self.create_multimodal_message(
                text_input, pdf_file, audio_file, recorded_audio
            )
            if processing_info:
                user_display += f"\n{' | '.join(processing_info)}"
            # Single-turn request: only the current message is sent.
            messages = [user_message]
            completion = self.client.chat.completions.create(
                extra_headers={
                    "HTTP-Referer": "https://multimodal-chatbot.local",
                    "X-Title": "Multimodal Chatbot",
                },
                model=self.model,
                messages=messages,
                max_tokens=2048,
                temperature=0.7
            )
            bot_response = completion.choices[0].message.content
            history.append((user_display, bot_response))
            return history, ""
        except Exception as e:
            error_msg = f"Error: {str(e)}"
            history.append((user_display if user_display else "Error in input", error_msg))
            return history, ""
def create_interface():
    """Create the Gradio interface.

    Builds four tabs (text, PDF, audio, combined), an API-key gate that
    enables/disables the submit buttons, and returns the gr.Blocks app.
    """
    with gr.Blocks(title="Multimodal Chatbot with Gemma 3n", theme=gr.themes.Soft()) as demo:
        # --- Intro / usage banner -------------------------------------------
        gr.Markdown("""
# 🤖 Multimodal Chatbot with Gemma 3n
This chatbot can process multiple types of input:
- **Text**: Regular text messages
- **PDF**: Extract and analyze document content
- **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC, recorded audio)
**Setup**: Enter your OpenRouter API key below to get started
""")
        # --- API key entry (held only in component state for this session) --
        with gr.Row():
            with gr.Column():
                api_key_input = gr.Textbox(
                    label="🔑 OpenRouter API Key",
                    placeholder="Enter your OpenRouter API key here...",
                    type="password",
                    info="Your API key is not stored and only used for this session"
                )
                api_status = gr.Textbox(
                    label="Connection Status",
                    value="❌ API Key not provided",
                    interactive=False
                )
        with gr.Tabs():
            # --- Tab 1: plain text chat -------------------------------------
            with gr.TabItem("💬 Text Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        text_input = gr.Textbox(
                            label="💬 Text Input",
                            placeholder="Type your message here...",
                            lines=5
                        )
                        # Submit buttons start disabled until a key is entered
                        # (see validate_api_key below).
                        text_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                        text_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                    with gr.Column(scale=2):
                        text_chatbot = gr.Chatbot(
                            label="Text Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )
            # --- Tab 2: PDF question-answering ------------------------------
            with gr.TabItem("📄 PDF Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        pdf_input = gr.File(
                            label="📄 PDF Upload",
                            file_types=[".pdf"],
                            type="filepath"
                        )
                        pdf_text_input = gr.Textbox(
                            label="💬 Question about PDF",
                            placeholder="Ask something about the PDF...",
                            lines=3
                        )
                        pdf_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                        pdf_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                    with gr.Column(scale=2):
                        pdf_chatbot = gr.Chatbot(
                            label="PDF Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )
            # --- Tab 3: uploaded or recorded audio --------------------------
            with gr.TabItem("🎤 Audio Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        audio_input = gr.Audio(
                            label="🎤 Audio Upload",
                            type="filepath"
                        )
                        # NOTE(review): type="numpy" delivers a
                        # (sample_rate, ndarray) tuple to the handler —
                        # confirm the transcription path handles that shape.
                        audio_recorder = gr.Microphone(
                            label="🎙️ Record Audio",
                            type="numpy"
                        )
                        audio_text_input = gr.Textbox(
                            label="💬 Question about Audio",
                            placeholder="Ask something about the audio...",
                            lines=3
                        )
                        audio_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                        audio_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                    with gr.Column(scale=2):
                        audio_chatbot = gr.Chatbot(
                            label="Audio Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )
            # --- Tab 4: all input types in one request ----------------------
            with gr.TabItem("🌟 Combined Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        combined_text_input = gr.Textbox(
                            label="💬 Text Input",
                            placeholder="Type your message here...",
                            lines=3
                        )
                        combined_pdf_input = gr.File(
                            label="📄 PDF Upload",
                            file_types=[".pdf"],
                            type="filepath"
                        )
                        combined_audio_input = gr.Audio(
                            label="🎤 Audio Upload",
                            type="filepath"
                        )
                        combined_audio_recorder = gr.Microphone(
                            label="🎙️ Record Audio",
                            type="numpy"
                        )
                        combined_submit_btn = gr.Button("🚀 Send All", variant="primary", size="lg", interactive=False)
                        combined_clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
                    with gr.Column(scale=2):
                        combined_chatbot = gr.Chatbot(
                            label="Combined Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )

        # --- Event handlers --------------------------------------------------
        def validate_api_key(api_key):
            """Update the status box and enable/disable all 4 submit buttons.

            Returns the status string followed by one gr.update per button.
            """
            if not api_key or len(api_key.strip()) == 0:
                return "❌ API Key not provided", *[gr.update(interactive=False) for _ in range(4)]
            try:
                # NOTE(review): constructing the client performs no network
                # request, so any non-empty string "validates" here — confirm
                # whether a real test call against the API is intended.
                test_client = OpenAI(
                    base_url="https://openrouter.ai/api/v1",
                    api_key=api_key.strip(),
                )
                return "✅ API Key validated successfully", *[gr.update(interactive=True) for _ in range(4)]
            except Exception as e:
                return f"❌ API Key validation failed: {str(e)}", *[gr.update(interactive=False) for _ in range(4)]

        def process_text_input(api_key, text, history):
            """Handle a text-only turn; returns (history, cleared_textbox)."""
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""
            # A fresh chatbot instance is created per request (stateless).
            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, history=history)

        def process_pdf_input(api_key, pdf, text, history):
            """Handle a PDF + question turn; returns (history, cleared_textbox)."""
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""
            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, pdf_file=pdf, history=history)

        def process_audio_input(api_key, audio, recorded_audio, text, history):
            """Handle uploaded and/or recorded audio plus an optional question."""
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""
            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, audio_file=audio, recorded_audio=recorded_audio, history=history)

        def process_combined_input(api_key, text, pdf, audio, recorded_audio, history):
            """Handle a turn combining text, PDF and audio inputs."""
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""
            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text, pdf, audio, recorded_audio, history)

        def clear_chat():
            """Reset a chat history and its text box."""
            return [], ""

        def clear_audio_inputs():
            """Reset the audio tab: history, question, upload and recording."""
            return [], "", None, None

        def clear_all_inputs():
            """Reset the combined tab: history, text, PDF, upload, recording."""
            return [], "", None, None, None

        # --- Wiring: components -> handlers ----------------------------------
        api_key_input.change(
            validate_api_key,
            inputs=[api_key_input],
            outputs=[api_status, text_submit_btn, pdf_submit_btn, audio_submit_btn, combined_submit_btn]
        )
        text_submit_btn.click(
            process_text_input,
            inputs=[api_key_input, text_input, text_chatbot],
            outputs=[text_chatbot, text_input]
        )
        # Pressing Enter in the text box submits as well.
        text_input.submit(
            process_text_input,
            inputs=[api_key_input, text_input, text_chatbot],
            outputs=[text_chatbot, text_input]
        )
        text_clear_btn.click(clear_chat, outputs=[text_chatbot, text_input])
        pdf_submit_btn.click(
            process_pdf_input,
            inputs=[api_key_input, pdf_input, pdf_text_input, pdf_chatbot],
            outputs=[pdf_chatbot, pdf_text_input]
        )
        pdf_clear_btn.click(lambda: ([], "", None), outputs=[pdf_chatbot, pdf_text_input, pdf_input])
        audio_submit_btn.click(
            process_audio_input,
            inputs=[api_key_input, audio_input, audio_recorder, audio_text_input, audio_chatbot],
            outputs=[audio_chatbot, audio_text_input]
        )
        audio_clear_btn.click(clear_audio_inputs, outputs=[audio_chatbot, audio_text_input, audio_input, audio_recorder])
        combined_submit_btn.click(
            process_combined_input,
            inputs=[api_key_input, combined_text_input, combined_pdf_input,
                    combined_audio_input, combined_audio_recorder, combined_chatbot],
            outputs=[combined_chatbot, combined_text_input]
        )
        combined_clear_btn.click(clear_all_inputs,
                                 outputs=[combined_chatbot, combined_text_input,
                                          combined_pdf_input, combined_audio_input,
                                          combined_audio_recorder])

        # --- Footer help text -------------------------------------------------
        gr.Markdown("""
### 🎯 How to Use Each Tab:
**💬 Text Chat**: Simple text conversations with the AI
**📄 PDF Chat**: Upload a PDF and ask questions about its content
**🎤 Audio Chat**: Upload or record audio files for transcription and analysis
- Supports: WAV, MP3, M4A, FLAC, OGG formats for uploads
- Recorded audio is processed directly from your microphone
- Best results with clear speech and minimal background noise
**🌟 Combined Chat**: Use multiple input types together for comprehensive analysis
### 🔑 Getting an API Key:
1. Go to [OpenRouter.ai](https://openrouter.ai)
2. Sign up for an account
3. Navigate to the API Keys section
4. Create a new API key
5. Copy and paste it in the field above
### ⚠️ Current Limitations:
- Audio transcription requires internet connection for best results
- Large files may take longer to process
- Recorded audio quality depends on your microphone
""")
    return demo
if __name__ == "__main__":
    # Dependencies printed as a setup hint for users running this script.
    required_packages = [
        "gradio",
        "openai",
        "PyPDF2",
        "SpeechRecognition",
        "pydub"
    ]
    print("🚀 Multimodal Chatbot with Gemma 3n")
    print("=" * 50)
    print("Required packages:", ", ".join(required_packages))
    print("\n📦 To install: pip install " + " ".join(required_packages))
    print("\n🎤 For audio processing, you may also need:")
    print(" - ffmpeg (for audio conversion)")
    print(" - sudo apt-get install espeak espeak-data libespeak1 libespeak-dev (for offline speech recognition)")
    print("\n🔑 Get your API key from: https://openrouter.ai")
    # Fixed garbled message: "when it carries" -> "when it loads".
    print("💡 Enter your API key in the web interface when it loads")
    demo = create_interface()
    # share=True exposes a temporary public Gradio URL alongside localhost.
    demo.launch(
        share=True
    )