import io
import os
import tempfile
from typing import List, Optional, Tuple

import gradio as gr
import PyPDF2
import speech_recognition as sr
from openai import OpenAI
from pydub import AudioSegment
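
# pydub shells out to the ffmpeg binary for anything that is not already WAV,
# and recognize_google() below calls Google's free Web Speech API, so an
# ffmpeg install and an internet connection are assumed at runtime.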


class MultimodalChatbot:
    def __init__(self, api_key: str):
        # OpenRouter exposes an OpenAI-compatible API, so the stock OpenAI
        # client works once base_url points at it.
        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
        self.model = "google/gemma-3n-e2b-it:free"
        self.conversation_history = []
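
    # NOTE: conversation_history is kept for future multi-turn support;
    # chat() below sends only the latest user message to the model, so each
    # request is effectively single-turn.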

    def extract_pdf_text(self, pdf_file) -> str:
        """Extract text from a PDF file."""
        try:
            # Gradio may hand over a file object or a plain path.
            if hasattr(pdf_file, 'name'):
                pdf_path = pdf_file.name
            else:
                pdf_path = pdf_file

            text = ""
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num, page in enumerate(pdf_reader.pages):
                    page_text = page.extract_text()
                    if page_text.strip():
                        text += f"Page {page_num + 1}:\n{page_text}\n\n"
            return text.strip() if text.strip() else "No text could be extracted from this PDF."
        except Exception as e:
            return f"Error extracting PDF: {str(e)}"
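
    # PyPDF2 only recovers a PDF's embedded text layer; scanned, image-only
    # pages come back empty, which is what the fallback message above covers.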

    def convert_audio_to_wav(self, audio_file) -> str:
        """Convert an audio file to WAV format for speech recognition."""
        try:
            if hasattr(audio_file, 'name'):
                audio_path = audio_file.name
            else:
                audio_path = audio_file

            file_ext = os.path.splitext(audio_path)[1].lower()
            if file_ext == '.wav':
                return audio_path

            audio = AudioSegment.from_file(audio_path)
            # NamedTemporaryFile avoids the race condition of the deprecated
            # tempfile.mktemp(); close it so ffmpeg can reopen the path.
            tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
            tmp.close()
            # Downmix to mono at 16 kHz, the format speech recognizers expect.
            audio.export(tmp.name, format="wav", parameters=["-ac", "1", "-ar", "16000"])
            return tmp.name
        except Exception as e:
            raise RuntimeError(f"Error converting audio: {str(e)}") from e

    def transcribe_audio(self, audio_file) -> str:
        """Transcribe an audio file to text."""
        try:
            recognizer = sr.Recognizer()
            wav_path = self.convert_audio_to_wav(audio_file)

            with sr.AudioFile(wav_path) as source:
                recognizer.adjust_for_ambient_noise(source, duration=0.2)
                audio_data = recognizer.record(source)

            try:
                text = recognizer.recognize_google(audio_data)
                return text
            except sr.UnknownValueError:
                return "Could not understand the audio. Please try with clearer audio."
            except sr.RequestError as e:
                # Fall back to offline CMU Sphinx (requires pocketsphinx).
                try:
                    text = recognizer.recognize_sphinx(audio_data)
                    return text
                except Exception:
                    return f"Speech recognition service error: {str(e)}"
        except Exception as e:
            return f"Error transcribing audio: {str(e)}"

    def transcribe_recorded_audio(self, audio_data) -> str:
        """Transcribe audio recorded via the microphone component."""
        try:
            recognizer = sr.Recognizer()
            tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
            tmp.close()
            wav_path = tmp.name

            if isinstance(audio_data, tuple):
                # gr.Microphone(type="numpy") delivers (sample_rate, samples).
                sample_rate, samples = audio_data
                if samples.ndim > 1:
                    # Downmix stereo to mono.
                    samples = samples.mean(axis=1).astype(samples.dtype)
                audio = AudioSegment(
                    data=samples.tobytes(),
                    sample_width=samples.dtype.itemsize,
                    frame_rate=sample_rate,
                    channels=1,
                )
            else:
                # Raw WAV bytes.
                audio = AudioSegment.from_file(io.BytesIO(audio_data), format="wav")
            audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])

            with sr.AudioFile(wav_path) as source:
                recognizer.adjust_for_ambient_noise(source, duration=0.2)
                audio_data = recognizer.record(source)

            try:
                text = recognizer.recognize_google(audio_data)
                return text
            except sr.UnknownValueError:
                return "Could not understand the recorded audio. Please try with clearer audio."
            except sr.RequestError as e:
                try:
                    text = recognizer.recognize_sphinx(audio_data)
                    return text
                except Exception:
                    return f"Speech recognition service error: {str(e)}"
        except Exception as e:
            return f"Error transcribing recorded audio: {str(e)}"

    def create_multimodal_message(self,
                                  text_input: str = "",
                                  pdf_file=None,
                                  audio_file=None,
                                  recorded_audio=None) -> Tuple[dict, List[str]]:
        """Create a multimodal message for the API, plus processing notes."""
        content_parts = []
        processing_info = []

        if text_input:
            content_parts.append({"type": "text", "text": text_input})

        if pdf_file is not None:
            pdf_text = self.extract_pdf_text(pdf_file)
            content_parts.append({
                "type": "text",
                "text": f"PDF Content:\n{pdf_text}"
            })
            processing_info.append("📄 PDF processed")

        if audio_file is not None:
            audio_text = self.transcribe_audio(audio_file)
            content_parts.append({
                "type": "text",
                "text": f"Audio Transcription:\n{audio_text}"
            })
            processing_info.append("🎤 Audio transcribed")

        if recorded_audio is not None:
            audio_text = self.transcribe_recorded_audio(recorded_audio)
            content_parts.append({
                "type": "text",
                "text": f"Recorded Audio Transcription:\n{audio_text}"
            })
            processing_info.append("🎙️ Recorded audio transcribed")

        return {"role": "user", "content": content_parts}, processing_info
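
    # The content is a list of typed parts (the OpenAI multimodal message
    # shape), but every part here is plain text: PDF extraction and speech
    # transcription both happen locally before the API call.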

    def chat(self,
             text_input: str = "",
             pdf_file=None,
             audio_file=None,
             recorded_audio=None,
             history: Optional[List[Tuple[str, str]]] = None) -> Tuple[List[Tuple[str, str]], str]:
        """Main chat function."""
        if history is None:
            history = []

        try:
            user_message_parts = []
            if text_input:
                user_message_parts.append(f"Text: {text_input}")
            if pdf_file:
                user_message_parts.append("📄 PDF uploaded")
            if audio_file:
                user_message_parts.append("🎤 Audio uploaded")
            if recorded_audio is not None:
                user_message_parts.append("🎙️ Recorded audio")

            user_display = " | ".join(user_message_parts)

            user_message, processing_info = self.create_multimodal_message(
                text_input, pdf_file, audio_file, recorded_audio
            )

            if processing_info:
                user_display += f"\n{' | '.join(processing_info)}"

            # Single-turn request: earlier history is shown in the UI but is
            # not resent to the model.
            messages = [user_message]

            completion = self.client.chat.completions.create(
                extra_headers={
                    "HTTP-Referer": "https://multimodal-chatbot.local",
                    "X-Title": "Multimodal Chatbot",
                },
                model=self.model,
                messages=messages,
                max_tokens=2048,
                temperature=0.7
            )

            bot_response = completion.choices[0].message.content
            history.append((user_display, bot_response))

            return history, ""

        except Exception as e:
            error_msg = f"Error: {str(e)}"
            history.append((user_display if 'user_display' in locals() else "Error in input", error_msg))
            return history, ""
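
    # A minimal usage sketch, assuming a valid key in a (hypothetical)
    # OPENROUTER_API_KEY environment variable and a local file paper.pdf:
    #   bot = MultimodalChatbot(os.environ["OPENROUTER_API_KEY"])
    #   history, _ = bot.chat(text_input="Summarize this PDF", pdf_file="paper.pdf")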


def create_interface():
    """Create the Gradio interface."""
    with gr.Blocks(title="Multimodal Chatbot with Gemma 3n", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🤖 Multimodal Chatbot with Gemma 3n

        This chatbot can process multiple types of input:
        - **Text**: Regular text messages
        - **PDF**: Extract and analyze document content
        - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC, and recorded audio)

        **Setup**: Enter your OpenRouter API key below to get started.
        """)

        with gr.Row():
            with gr.Column():
                api_key_input = gr.Textbox(
                    label="🔑 OpenRouter API Key",
                    placeholder="Enter your OpenRouter API key here...",
                    type="password",
                    info="Your API key is not stored and is only used for this session"
                )
                api_status = gr.Textbox(
                    label="Connection Status",
                    value="❌ API Key not provided",
                    interactive=False
                )

        with gr.Tabs():
            with gr.TabItem("💬 Text Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        text_input = gr.Textbox(
                            label="💬 Text Input",
                            placeholder="Type your message here...",
                            lines=5
                        )
                        text_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                        text_clear_btn = gr.Button("🗑️ Clear", variant="secondary")

                    with gr.Column(scale=2):
                        text_chatbot = gr.Chatbot(
                            label="Text Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )

            with gr.TabItem("📄 PDF Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        pdf_input = gr.File(
                            label="📄 PDF Upload",
                            file_types=[".pdf"],
                            type="filepath"
                        )
                        pdf_text_input = gr.Textbox(
                            label="💬 Question about PDF",
                            placeholder="Ask something about the PDF...",
                            lines=3
                        )
                        pdf_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                        pdf_clear_btn = gr.Button("🗑️ Clear", variant="secondary")

                    with gr.Column(scale=2):
                        pdf_chatbot = gr.Chatbot(
                            label="PDF Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )

            with gr.TabItem("🎤 Audio Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        audio_input = gr.Audio(
                            label="🎤 Audio Upload",
                            type="filepath"
                        )
                        audio_recorder = gr.Microphone(
                            label="🎙️ Record Audio",
                            type="numpy"
                        )
                        audio_text_input = gr.Textbox(
                            label="💬 Question about Audio",
                            placeholder="Ask something about the audio...",
                            lines=3
                        )
                        audio_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                        audio_clear_btn = gr.Button("🗑️ Clear", variant="secondary")

                    with gr.Column(scale=2):
                        audio_chatbot = gr.Chatbot(
                            label="Audio Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )

            with gr.TabItem("🔄 Combined Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        combined_text_input = gr.Textbox(
                            label="💬 Text Input",
                            placeholder="Type your message here...",
                            lines=3
                        )
                        combined_pdf_input = gr.File(
                            label="📄 PDF Upload",
                            file_types=[".pdf"],
                            type="filepath"
                        )
                        combined_audio_input = gr.Audio(
                            label="🎤 Audio Upload",
                            type="filepath"
                        )
                        combined_audio_recorder = gr.Microphone(
                            label="🎙️ Record Audio",
                            type="numpy"
                        )
                        combined_submit_btn = gr.Button("🚀 Send All", variant="primary", size="lg", interactive=False)
                        combined_clear_btn = gr.Button("🗑️ Clear All", variant="secondary")

                    with gr.Column(scale=2):
                        combined_chatbot = gr.Chatbot(
                            label="Combined Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )

        def validate_api_key(api_key):
            if not api_key or len(api_key.strip()) == 0:
                return "❌ API Key not provided", *[gr.update(interactive=False) for _ in range(4)]

            # Constructing the client never contacts the server, so this is
            # only a sanity check; the key is truly exercised on the first
            # chat request.
            try:
                OpenAI(
                    base_url="https://openrouter.ai/api/v1",
                    api_key=api_key.strip(),
                )
                return "✅ API Key accepted (verified on first request)", *[gr.update(interactive=True) for _ in range(4)]
            except Exception as e:
                return f"❌ API Key validation failed: {str(e)}", *[gr.update(interactive=False) for _ in range(4)]

        def process_text_input(api_key, text, history):
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""

            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, history=history)

        def process_pdf_input(api_key, pdf, text, history):
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""

            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, pdf_file=pdf, history=history)

        def process_audio_input(api_key, audio, recorded_audio, text, history):
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""

            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, audio_file=audio, recorded_audio=recorded_audio, history=history)

        def process_combined_input(api_key, text, pdf, audio, recorded_audio, history):
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""

            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text, pdf, audio, recorded_audio, history)

        def clear_chat():
            return [], ""

        def clear_audio_inputs():
            return [], "", None, None

        def clear_all_inputs():
            return [], "", None, None, None

        api_key_input.change(
            validate_api_key,
            inputs=[api_key_input],
            outputs=[api_status, text_submit_btn, pdf_submit_btn, audio_submit_btn, combined_submit_btn]
        )

        text_submit_btn.click(
            process_text_input,
            inputs=[api_key_input, text_input, text_chatbot],
            outputs=[text_chatbot, text_input]
        )
        text_input.submit(
            process_text_input,
            inputs=[api_key_input, text_input, text_chatbot],
            outputs=[text_chatbot, text_input]
        )
        text_clear_btn.click(clear_chat, outputs=[text_chatbot, text_input])

        pdf_submit_btn.click(
            process_pdf_input,
            inputs=[api_key_input, pdf_input, pdf_text_input, pdf_chatbot],
            outputs=[pdf_chatbot, pdf_text_input]
        )
        pdf_clear_btn.click(lambda: ([], "", None), outputs=[pdf_chatbot, pdf_text_input, pdf_input])

        audio_submit_btn.click(
            process_audio_input,
            inputs=[api_key_input, audio_input, audio_recorder, audio_text_input, audio_chatbot],
            outputs=[audio_chatbot, audio_text_input]
        )
        audio_clear_btn.click(clear_audio_inputs, outputs=[audio_chatbot, audio_text_input, audio_input, audio_recorder])

        combined_submit_btn.click(
            process_combined_input,
            inputs=[api_key_input, combined_text_input, combined_pdf_input,
                    combined_audio_input, combined_audio_recorder, combined_chatbot],
            outputs=[combined_chatbot, combined_text_input]
        )
        combined_clear_btn.click(clear_all_inputs,
                                 outputs=[combined_chatbot, combined_text_input,
                                          combined_pdf_input, combined_audio_input,
                                          combined_audio_recorder])
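
        # Gradio passes each listed input component's current value to the
        # handler positionally, then writes the returned tuple back to the
        # output components in order.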

        gr.Markdown("""
        ### 🎯 How to Use Each Tab:

        **💬 Text Chat**: Simple text conversations with the AI

        **📄 PDF Chat**: Upload a PDF and ask questions about its content

        **🎤 Audio Chat**: Upload or record audio files for transcription and analysis
        - Supports: WAV, MP3, M4A, FLAC, OGG formats for uploads
        - Recorded audio is processed directly from your microphone
        - Best results with clear speech and minimal background noise

        **🔄 Combined Chat**: Use multiple input types together for comprehensive analysis

        ### 🔑 Getting an API Key:
        1. Go to [OpenRouter.ai](https://openrouter.ai)
        2. Sign up for an account
        3. Navigate to the API Keys section
        4. Create a new API key
        5. Copy and paste it into the field above

        ### ⚠️ Current Limitations:
        - Audio transcription requires an internet connection for best results
        - Large files may take longer to process
        - Recorded audio quality depends on your microphone
        """)

    return demo


if __name__ == "__main__":
    required_packages = [
        "gradio",
        "openai",
        "PyPDF2",
        "SpeechRecognition",
        "pydub"
    ]

    print("🚀 Multimodal Chatbot with Gemma 3n")
    print("=" * 50)
    print("Required packages:", ", ".join(required_packages))
    print("\n📦 To install: pip install " + " ".join(required_packages))
    print("\n🎤 For audio processing, you may also need:")
    print("  - ffmpeg (for audio conversion)")
    print("  - pocketsphinx (pip install pocketsphinx) for the offline speech recognition fallback")
    print("\n🔑 Get your API key from: https://openrouter.ai")
    print("💡 Enter your API key in the web interface once it loads")

    demo = create_interface()
    demo.launch(
        share=True
    )