shukdevdattaEX committed on
Commit
dcec17f
·
verified ·
1 Parent(s): dc31593

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -258
app.py CHANGED
@@ -4,17 +4,10 @@ import io
4
  import os
5
  from openai import OpenAI
6
  import PyPDF2
7
- from PIL import Image
8
  import speech_recognition as sr
9
  import tempfile
10
- import cv2
11
- import numpy as np
12
- from typing import List, Tuple, Optional
13
- import json
14
- import pydub
15
  from pydub import AudioSegment
16
- from transformers import pipeline
17
- import torch
18
 
19
  class MultimodalChatbot:
20
  def __init__(self, api_key: str):
@@ -22,54 +15,23 @@ class MultimodalChatbot:
22
  base_url="https://openrouter.ai/api/v1",
23
  api_key=api_key,
24
  )
25
- self.model = "google/gemma-2-9b-it:free"
26
  self.conversation_history = []
27
- # Initialize the pipeline for image-text-to-text processing
28
- try:
29
- self.pipe = pipeline(
30
- "image-captioning",
31
- model="Salesforce/blip-image-captioning-base",
32
- device="cpu", # Optimized for CPU in HF Spaces
33
- torch_dtype=torch.float32, # Use float32 for CPU compatibility
34
- )
35
- print("Image captioning pipeline initialized successfully")
36
- except Exception as e:
37
- print(f"Error initializing image captioning pipeline: {e}")
38
- self.pipe = None
39
-
40
- def encode_image_to_base64(self, image) -> str:
41
- """Convert PIL Image or file path to base64 string"""
42
- try:
43
- if isinstance(image, str):
44
- with open(image, "rb") as img_file:
45
- return base64.b64encode(img_file.read()).decode('utf-8')
46
- elif isinstance(image, Image.Image):
47
- buffered = io.BytesIO()
48
- if image.mode == 'RGBA':
49
- image = image.convert('RGB')
50
- image.save(buffered, format="JPEG", quality=85)
51
- return base64.b64encode(buffered.getvalue()).decode('utf-8')
52
- else:
53
- raise ValueError("Invalid image input")
54
- except Exception as e:
55
- return f"Error encoding image: {str(e)}"
56
-
57
  def extract_pdf_text(self, pdf_file) -> str:
58
  """Extract text from PDF file"""
59
  try:
60
- if isinstance(pdf_file, str):
61
- pdf_path = pdf_file
62
- elif hasattr(pdf_file, 'name'):
63
  pdf_path = pdf_file.name
64
  else:
65
- raise ValueError("Invalid PDF file input")
66
 
67
  text = ""
68
  with open(pdf_path, 'rb') as file:
69
  pdf_reader = PyPDF2.PdfReader(file)
70
  for page_num, page in enumerate(pdf_reader.pages):
71
  page_text = page.extract_text()
72
- if page_text and page_text.strip():
73
  text += f"Page {page_num + 1}:\n{page_text}\n\n"
74
  return text.strip() if text.strip() else "No text could be extracted from this PDF."
75
  except Exception as e:
@@ -78,12 +40,10 @@ class MultimodalChatbot:
78
  def convert_audio_to_wav(self, audio_file) -> str:
79
  """Convert audio file to WAV format for speech recognition"""
80
  try:
81
- if isinstance(audio_file, str):
82
- audio_path = audio_file
83
- elif hasattr(audio_file, 'name'):
84
  audio_path = audio_file.name
85
  else:
86
- raise ValueError("Invalid audio file input")
87
 
88
  file_ext = os.path.splitext(audio_path)[1].lower()
89
  if file_ext == '.wav':
@@ -94,7 +54,7 @@ class MultimodalChatbot:
94
  audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
95
  return wav_path
96
  except Exception as e:
97
- return f"Error converting audio: {str(e)}"
98
 
99
  def transcribe_audio(self, audio_file) -> str:
100
  """Transcribe audio file to text"""
@@ -105,6 +65,7 @@ class MultimodalChatbot:
105
  with sr.AudioFile(wav_path) as source:
106
  recognizer.adjust_for_ambient_noise(source, duration=0.2)
107
  audio_data = recognizer.record(source)
 
108
  try:
109
  text = recognizer.recognize_google(audio_data)
110
  return text
@@ -119,47 +80,10 @@ class MultimodalChatbot:
119
  except Exception as e:
120
  return f"Error transcribing audio: {str(e)}"
121
 
122
- def extract_video_frame(self, video_file, frame_number=None):
123
- """Extract a frame from the video"""
124
- try:
125
- if isinstance(video_file, str):
126
- video_path = video_file
127
- elif hasattr(video_file, 'name'):
128
- video_path = video_file.name
129
- else:
130
- raise ValueError("Invalid video file input")
131
-
132
- cap = cv2.VideoCapture(video_path)
133
- if not cap.isOpened():
134
- return None, "Could not open video file"
135
-
136
- total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
137
- if total_frames <= 0:
138
- cap.release()
139
- return None, "Video has no frames"
140
-
141
- if frame_number is None:
142
- frame_number = total_frames // 2 # Extract middle frame
143
- if frame_number >= total_frames:
144
- frame_number = total_frames - 1
145
-
146
- cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
147
- ret, frame = cap.read()
148
- cap.release()
149
- if ret:
150
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
151
- return Image.fromarray(frame), f"Extracted frame {frame_number} of {total_frames}"
152
- else:
153
- return None, "Failed to extract frame"
154
- except Exception as e:
155
- return None, f"Error extracting video frame: {str(e)}"
156
-
157
  def create_multimodal_message(self,
158
  text_input: str = "",
159
  pdf_file=None,
160
- audio_file=None,
161
- image_file=None,
162
- video_file=None) -> dict:
163
  """Create a multimodal message for the API"""
164
  content_parts = []
165
  processing_info = []
@@ -169,64 +93,26 @@ class MultimodalChatbot:
169
 
170
  if pdf_file is not None:
171
  pdf_text = self.extract_pdf_text(pdf_file)
172
- content_parts.append({"type": "text", "text": f"PDF Content:\n{pdf_text}"})
 
 
 
173
  processing_info.append("πŸ“„ PDF processed")
174
 
175
  if audio_file is not None:
176
  audio_text = self.transcribe_audio(audio_file)
177
- content_parts.append({"type": "text", "text": f"Audio Transcription:\n{audio_text}"})
 
 
 
178
  processing_info.append("🎀 Audio transcribed")
179
 
180
- if image_file is not None and self.pipe is not None:
181
- try:
182
- if isinstance(image_file, str):
183
- image = Image.open(image_file)
184
- else:
185
- image = image_file
186
- # Use BLIP model for image captioning
187
- output = self.pipe(image)
188
- description = output[0]['generated_caption']
189
- if text_input:
190
- content_parts.append({"type": "text", "text": f"Image analysis (based on '{text_input}'): {description}"})
191
- else:
192
- content_parts.append({"type": "text", "text": f"Image analysis: {description}"})
193
- processing_info.append("πŸ–ΌοΈ Image analyzed")
194
- except Exception as e:
195
- content_parts.append({"type": "text", "text": f"Error analyzing image: {str(e)}"})
196
- processing_info.append("πŸ–ΌοΈ Image analysis failed")
197
- elif image_file is not None:
198
- content_parts.append({"type": "text", "text": "Image uploaded. Analysis failed due to model initialization error."})
199
- processing_info.append("πŸ–ΌοΈ Image received (analysis failed)")
200
-
201
- if video_file is not None and self.pipe is not None:
202
- frame, frame_info = self.extract_video_frame(video_file)
203
- if frame:
204
- try:
205
- output = self.pipe(frame)
206
- description = output[0]['generated_caption']
207
- if text_input:
208
- content_parts.append({"type": "text", "text": f"Video frame analysis (based on '{text_input}'): {description}. Frame info: {frame_info}. Please describe the video for further assistance."})
209
- else:
210
- content_parts.append({"type": "text", "text": f"Video frame analysis: {description}. Frame info: {frame_info}. Please describe the video for further assistance."})
211
- processing_info.append("πŸŽ₯ Video frame analyzed")
212
- except Exception as e:
213
- content_parts.append({"type": "text", "text": f"Error analyzing video frame: {str(e)}. Frame info: {frame_info}"})
214
- processing_info.append("πŸŽ₯ Video frame analysis failed")
215
- else:
216
- content_parts.append({"type": "text", "text": f"Could not extract frame from video: {frame_info}. Please describe the video."})
217
- processing_info.append("πŸŽ₯ Video processing failed")
218
- elif video_file is not None:
219
- content_parts.append({"type": "text", "text": "Video uploaded. Analysis failed due to model initialization error."})
220
- processing_info.append("πŸŽ₯ Video received (analysis failed)")
221
-
222
  return {"role": "user", "content": content_parts}, processing_info
223
 
224
  def chat(self,
225
  text_input: str = "",
226
  pdf_file=None,
227
  audio_file=None,
228
- image_file=None,
229
- video_file=None,
230
  history: List[Tuple[str, str]] = None) -> Tuple[List[Tuple[str, str]], str]:
231
  """Main chat function"""
232
  if history is None:
@@ -240,20 +126,18 @@ class MultimodalChatbot:
240
  user_message_parts.append("πŸ“„ PDF uploaded")
241
  if audio_file:
242
  user_message_parts.append("🎀 Audio uploaded")
243
- if image_file:
244
- user_message_parts.append("πŸ–ΌοΈ Image uploaded")
245
- if video_file:
246
- user_message_parts.append("πŸŽ₯ Video uploaded")
247
 
248
  user_display = " | ".join(user_message_parts)
 
249
  user_message, processing_info = self.create_multimodal_message(
250
- text_input, pdf_file, audio_file, image_file, video_file
251
  )
252
 
253
  if processing_info:
254
  user_display += f"\n{' | '.join(processing_info)}"
255
 
256
  messages = [user_message]
 
257
  completion = self.client.chat.completions.create(
258
  extra_headers={
259
  "HTTP-Referer": "https://multimodal-chatbot.local",
@@ -267,7 +151,9 @@ class MultimodalChatbot:
267
 
268
  bot_response = completion.choices[0].message.content
269
  history.append((user_display, bot_response))
 
270
  return history, ""
 
271
  except Exception as e:
272
  error_msg = f"Error: {str(e)}"
273
  history.append((user_display if 'user_display' in locals() else "Error in input", error_msg))
@@ -275,16 +161,14 @@ class MultimodalChatbot:
275
 
276
  def create_interface():
277
  """Create the Gradio interface"""
278
- with gr.Blocks(title="Multimodal Chatbot with BLIP and Gemma", theme=gr.themes.Soft()) as demo:
279
  gr.Markdown("""
280
- # πŸ€– Multimodal Chatbot with BLIP and Gemma
281
 
282
  This chatbot can process multiple types of input:
283
- - **Text**: Regular text messages using Gemma
284
  - **PDF**: Extract and analyze document content
285
  - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)
286
- - **Images**: Upload images for analysis using BLIP
287
- - **Video**: Upload videos for basic frame analysis using BLIP
288
 
289
  **Setup**: Enter your OpenRouter API key below to get started
290
  """)
@@ -314,6 +198,7 @@ def create_interface():
314
  )
315
  text_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
316
  text_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
 
317
  with gr.Column(scale=2):
318
  text_chatbot = gr.Chatbot(
319
  label="Text Chat History",
@@ -337,6 +222,7 @@ def create_interface():
337
  )
338
  pdf_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
339
  pdf_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
 
340
  with gr.Column(scale=2):
341
  pdf_chatbot = gr.Chatbot(
342
  label="PDF Chat History",
@@ -360,6 +246,7 @@ def create_interface():
360
  )
361
  audio_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
362
  audio_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
 
363
  with gr.Column(scale=2):
364
  audio_chatbot = gr.Chatbot(
365
  label="Audio Chat History",
@@ -368,51 +255,6 @@ def create_interface():
368
  show_copy_button=True
369
  )
370
 
371
- with gr.TabItem("πŸ–ΌοΈ Image Chat"):
372
- with gr.Row():
373
- with gr.Column(scale=1):
374
- image_input = gr.Image(
375
- label="πŸ–ΌοΈ Image Upload",
376
- type="pil"
377
- )
378
- image_text_input = gr.Textbox(
379
- label="πŸ’¬ Question about Image",
380
- placeholder="Ask something about the image...",
381
- lines=3
382
- )
383
- image_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
384
- image_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
385
- with gr.Column(scale=2):
386
- image_chatbot = gr.Chatbot(
387
- label="Image Chat History",
388
- height=600,
389
- bubble_full_width=False,
390
- show_copy_button=True
391
- )
392
-
393
- with gr.TabItem("πŸŽ₯ Video Chat"):
394
- with gr.Row():
395
- with gr.Column(scale=1):
396
- video_input = gr.File(
397
- label="πŸŽ₯ Video Upload",
398
- file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
399
- type="filepath"
400
- )
401
- video_text_input = gr.Textbox(
402
- label="πŸ’¬ Question about Video",
403
- placeholder="Ask something about the video...",
404
- lines=3
405
- )
406
- video_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
407
- video_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
408
- with gr.Column(scale=2):
409
- video_chatbot = gr.Chatbot(
410
- label="Video Chat History",
411
- height=600,
412
- bubble_full_width=False,
413
- show_copy_button=True
414
- )
415
-
416
  with gr.TabItem("🌟 Combined Chat"):
417
  with gr.Row():
418
  with gr.Column(scale=1):
@@ -431,17 +273,9 @@ def create_interface():
431
  file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
432
  type="filepath"
433
  )
434
- combined_image_input = gr.Image(
435
- label="πŸ–ΌοΈ Image Upload",
436
- type="pil"
437
- )
438
- combined_video_input = gr.File(
439
- label="πŸŽ₯ Video Upload",
440
- file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
441
- type="filepath"
442
- )
443
  combined_submit_btn = gr.Button("πŸš€ Send All", variant="primary", size="lg", interactive=False)
444
  combined_clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
 
445
  with gr.Column(scale=2):
446
  combined_chatbot = gr.Chatbot(
447
  label="Combined Chat History",
@@ -452,15 +286,16 @@ def create_interface():
452
 
453
  def validate_api_key(api_key):
454
  if not api_key or len(api_key.strip()) == 0:
455
- return "❌ API Key not provided", *[gr.update(interactive=False) for _ in range(6)]
 
456
  try:
457
  test_client = OpenAI(
458
  base_url="https://openrouter.ai/api/v1",
459
  api_key=api_key.strip(),
460
  )
461
- return "βœ… API Key validated successfully", *[gr.update(interactive=True) for _ in range(6)]
462
  except Exception as e:
463
- return f"❌ API Key validation failed: {str(e)}", *[gr.update(interactive=False) for _ in range(6)]
464
 
465
  def process_text_input(api_key, text, history):
466
  if not api_key or len(api_key.strip()) == 0:
@@ -468,6 +303,7 @@ def create_interface():
468
  history = []
469
  history.append(("Error", "❌ Please provide a valid API key first"))
470
  return history, ""
 
471
  chatbot = MultimodalChatbot(api_key.strip())
472
  return chatbot.chat(text_input=text, history=history)
473
 
@@ -477,6 +313,7 @@ def create_interface():
477
  history = []
478
  history.append(("Error", "❌ Please provide a valid API key first"))
479
  return history, ""
 
480
  chatbot = MultimodalChatbot(api_key.strip())
481
  return chatbot.chat(text_input=text, pdf_file=pdf, history=history)
482
 
@@ -486,47 +323,30 @@ def create_interface():
486
  history = []
487
  history.append(("Error", "❌ Please provide a valid API key first"))
488
  return history, ""
 
489
  chatbot = MultimodalChatbot(api_key.strip())
490
  return chatbot.chat(text_input=text, audio_file=audio, history=history)
491
 
492
- def process_image_input(api_key, image, text, history):
493
- if not api_key or len(api_key.strip()) == 0:
494
- if history is None:
495
- history = []
496
- history.append(("Error", "❌ Please provide a valid API key first"))
497
- return history, ""
498
- chatbot = MultimodalChatbot(api_key.strip())
499
- return chatbot.chat(text_input=text, image_file=image, history=history)
500
-
501
- def process_video_input(api_key, video, text, history):
502
- if not api_key or len(api_key.strip()) == 0:
503
- if history is None:
504
- history = []
505
- history.append(("Error", "❌ Please provide a valid API key first"))
506
- return history, ""
507
- chatbot = MultimodalChatbot(api_key.strip())
508
- return chatbot.chat(text_input=text, video_file=video, history=history)
509
-
510
- def process_combined_input(api_key, text, pdf, audio, image, video, history):
511
  if not api_key or len(api_key.strip()) == 0:
512
  if history is None:
513
  history = []
514
  history.append(("Error", "❌ Please provide a valid API key first"))
515
  return history, ""
 
516
  chatbot = MultimodalChatbot(api_key.strip())
517
- return chatbot.chat(text_input=text, pdf_file=pdf, audio_file=audio, image_file=image, video_file=video, history=history)
518
 
519
  def clear_chat():
520
  return [], ""
521
 
522
  def clear_all_inputs():
523
- return [], "", None, None, None, None
524
 
525
  api_key_input.change(
526
  validate_api_key,
527
  inputs=[api_key_input],
528
- outputs=[api_status, text_submit_btn, pdf_submit_btn, audio_submit_btn,
529
- image_submit_btn, video_submit_btn, combined_submit_btn]
530
  )
531
 
532
  text_submit_btn.click(
@@ -555,34 +375,20 @@ def create_interface():
555
  )
556
  audio_clear_btn.click(lambda: ([], "", None), outputs=[audio_chatbot, audio_text_input, audio_input])
557
 
558
- image_submit_btn.click(
559
- process_image_input,
560
- inputs=[api_key_input, image_input, image_text_input, image_chatbot],
561
- outputs=[image_chatbot, image_text_input]
562
- )
563
- image_clear_btn.click(lambda: ([], "", None), outputs=[image_chatbot, image_text_input, image_input])
564
-
565
- video_submit_btn.click(
566
- process_video_input,
567
- inputs=[api_key_input, video_input, video_text_input, video_chatbot],
568
- outputs=[video_chatbot, video_text_input]
569
- )
570
- video_clear_btn.click(lambda: ([], "", None), outputs=[video_chatbot, video_text_input, video_input])
571
-
572
  combined_submit_btn.click(
573
  process_combined_input,
574
  inputs=[api_key_input, combined_text_input, combined_pdf_input,
575
- combined_audio_input, combined_image_input, combined_video_input, combined_chatbot],
576
  outputs=[combined_chatbot, combined_text_input]
577
  )
578
  combined_clear_btn.click(clear_all_inputs,
579
- outputs=[combined_chatbot, combined_text_input, combined_pdf_input,
580
- combined_audio_input, combined_image_input, combined_video_input])
581
 
582
  gr.Markdown("""
583
  ### 🎯 How to Use Each Tab:
584
 
585
- **πŸ’¬ Text Chat**: Simple text conversations with the AI using Gemma
586
 
587
  **πŸ“„ PDF Chat**: Upload a PDF and ask questions about its content
588
 
@@ -590,12 +396,6 @@ def create_interface():
590
  - Supports: WAV, MP3, M4A, FLAC, OGG formats
591
  - Best results with clear speech and minimal background noise
592
 
593
- **πŸ–ΌοΈ Image Chat**: Upload images for analysis using BLIP
594
- - Provide a text prompt to guide the analysis (e.g., "What is in this image?")
595
-
596
- **πŸŽ₯ Video Chat**: Upload videos for basic frame analysis using BLIP
597
- - Analysis is based on a single frame; provide a text description for full video context
598
-
599
  **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis
600
 
601
  ### πŸ”‘ Getting an API Key:
@@ -606,10 +406,8 @@ def create_interface():
606
  5. Copy and paste it in the field above
607
 
608
  ### ⚠️ Current Limitations:
609
- - Image and video analysis may be slow on CPU in Hugging Face Spaces
610
- - Video analysis is limited to a single frame due to CPU constraints
611
  - Large files may take longer to process
612
- - BLIP model may provide basic captions; detailed video descriptions require additional user input
613
  """)
614
 
615
  return demo
@@ -619,16 +417,11 @@ if __name__ == "__main__":
619
  "gradio",
620
  "openai",
621
  "PyPDF2",
622
- "Pillow",
623
  "SpeechRecognition",
624
- "opencv-python",
625
- "numpy",
626
- "pydub",
627
- "transformers",
628
- "torch"
629
  ]
630
 
631
- print("πŸš€ Multimodal Chatbot with BLIP and Gemma")
632
  print("=" * 50)
633
  print("Required packages:", ", ".join(required_packages))
634
  print("\nπŸ“¦ To install: pip install " + " ".join(required_packages))
@@ -639,4 +432,6 @@ if __name__ == "__main__":
639
  print("πŸ’‘ Enter your API key in the web interface when it loads")
640
 
641
  demo = create_interface()
642
- demo.launch(share=True) #
 
 
 
4
  import os
5
  from openai import OpenAI
6
  import PyPDF2
 
7
  import speech_recognition as sr
8
  import tempfile
 
 
 
 
 
9
  from pydub import AudioSegment
10
+ from typing import List, Tuple, Optional
 
11
 
12
  class MultimodalChatbot:
13
  def __init__(self, api_key: str):
 
15
  base_url="https://openrouter.ai/api/v1",
16
  api_key=api_key,
17
  )
18
+ self.model = "google/gemma-3n-e2b-it:free"
19
  self.conversation_history = []
20
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  def extract_pdf_text(self, pdf_file) -> str:
22
  """Extract text from PDF file"""
23
  try:
24
+ if hasattr(pdf_file, 'name'):
 
 
25
  pdf_path = pdf_file.name
26
  else:
27
+ pdf_path = pdf_file
28
 
29
  text = ""
30
  with open(pdf_path, 'rb') as file:
31
  pdf_reader = PyPDF2.PdfReader(file)
32
  for page_num, page in enumerate(pdf_reader.pages):
33
  page_text = page.extract_text()
34
+ if page_text.strip():
35
  text += f"Page {page_num + 1}:\n{page_text}\n\n"
36
  return text.strip() if text.strip() else "No text could be extracted from this PDF."
37
  except Exception as e:
 
40
  def convert_audio_to_wav(self, audio_file) -> str:
41
  """Convert audio file to WAV format for speech recognition"""
42
  try:
43
+ if hasattr(audio_file, 'name'):
 
 
44
  audio_path = audio_file.name
45
  else:
46
+ audio_path = audio_file
47
 
48
  file_ext = os.path.splitext(audio_path)[1].lower()
49
  if file_ext == '.wav':
 
54
  audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
55
  return wav_path
56
  except Exception as e:
57
+ raise Exception(f"Error converting audio: {str(e)}")
58
 
59
  def transcribe_audio(self, audio_file) -> str:
60
  """Transcribe audio file to text"""
 
65
  with sr.AudioFile(wav_path) as source:
66
  recognizer.adjust_for_ambient_noise(source, duration=0.2)
67
  audio_data = recognizer.record(source)
68
+
69
  try:
70
  text = recognizer.recognize_google(audio_data)
71
  return text
 
80
  except Exception as e:
81
  return f"Error transcribing audio: {str(e)}"
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  def create_multimodal_message(self,
84
  text_input: str = "",
85
  pdf_file=None,
86
+ audio_file=None) -> dict:
 
 
87
  """Create a multimodal message for the API"""
88
  content_parts = []
89
  processing_info = []
 
93
 
94
  if pdf_file is not None:
95
  pdf_text = self.extract_pdf_text(pdf_file)
96
+ content_parts.append({
97
+ "type": "text",
98
+ "text": f"PDF Content:\n{pdf_text}"
99
+ })
100
  processing_info.append("πŸ“„ PDF processed")
101
 
102
  if audio_file is not None:
103
  audio_text = self.transcribe_audio(audio_file)
104
+ content_parts.append({
105
+ "type": "text",
106
+ "text": f"Audio Transcription:\n{audio_text}"
107
+ })
108
  processing_info.append("🎀 Audio transcribed")
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  return {"role": "user", "content": content_parts}, processing_info
111
 
112
  def chat(self,
113
  text_input: str = "",
114
  pdf_file=None,
115
  audio_file=None,
 
 
116
  history: List[Tuple[str, str]] = None) -> Tuple[List[Tuple[str, str]], str]:
117
  """Main chat function"""
118
  if history is None:
 
126
  user_message_parts.append("πŸ“„ PDF uploaded")
127
  if audio_file:
128
  user_message_parts.append("🎀 Audio uploaded")
 
 
 
 
129
 
130
  user_display = " | ".join(user_message_parts)
131
+
132
  user_message, processing_info = self.create_multimodal_message(
133
+ text_input, pdf_file, audio_file
134
  )
135
 
136
  if processing_info:
137
  user_display += f"\n{' | '.join(processing_info)}"
138
 
139
  messages = [user_message]
140
+
141
  completion = self.client.chat.completions.create(
142
  extra_headers={
143
  "HTTP-Referer": "https://multimodal-chatbot.local",
 
151
 
152
  bot_response = completion.choices[0].message.content
153
  history.append((user_display, bot_response))
154
+
155
  return history, ""
156
+
157
  except Exception as e:
158
  error_msg = f"Error: {str(e)}"
159
  history.append((user_display if 'user_display' in locals() else "Error in input", error_msg))
 
161
 
162
  def create_interface():
163
  """Create the Gradio interface"""
164
+ with gr.Blocks(title="Multimodal Chatbot with Gemma 3n", theme=gr.themes.Soft()) as demo:
165
  gr.Markdown("""
166
+ # πŸ€– Multimodal Chatbot with Gemma 3n
167
 
168
  This chatbot can process multiple types of input:
169
+ - **Text**: Regular text messages
170
  - **PDF**: Extract and analyze document content
171
  - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)
 
 
172
 
173
  **Setup**: Enter your OpenRouter API key below to get started
174
  """)
 
198
  )
199
  text_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
200
  text_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
201
+
202
  with gr.Column(scale=2):
203
  text_chatbot = gr.Chatbot(
204
  label="Text Chat History",
 
222
  )
223
  pdf_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
224
  pdf_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
225
+
226
  with gr.Column(scale=2):
227
  pdf_chatbot = gr.Chatbot(
228
  label="PDF Chat History",
 
246
  )
247
  audio_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
248
  audio_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
249
+
250
  with gr.Column(scale=2):
251
  audio_chatbot = gr.Chatbot(
252
  label="Audio Chat History",
 
255
  show_copy_button=True
256
  )
257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  with gr.TabItem("🌟 Combined Chat"):
259
  with gr.Row():
260
  with gr.Column(scale=1):
 
273
  file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
274
  type="filepath"
275
  )
 
 
 
 
 
 
 
 
 
276
  combined_submit_btn = gr.Button("πŸš€ Send All", variant="primary", size="lg", interactive=False)
277
  combined_clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
278
+
279
  with gr.Column(scale=2):
280
  combined_chatbot = gr.Chatbot(
281
  label="Combined Chat History",
 
286
 
287
  def validate_api_key(api_key):
288
  if not api_key or len(api_key.strip()) == 0:
289
+ return "❌ API Key not provided", *[gr.update(interactive=False) for _ in range(4)]
290
+
291
  try:
292
  test_client = OpenAI(
293
  base_url="https://openrouter.ai/api/v1",
294
  api_key=api_key.strip(),
295
  )
296
+ return "βœ… API Key validated successfully", *[gr.update(interactive=True) for _ in range(4)]
297
  except Exception as e:
298
+ return f"❌ API Key validation failed: {str(e)}", *[gr.update(interactive=False) for _ in range(4)]
299
 
300
  def process_text_input(api_key, text, history):
301
  if not api_key or len(api_key.strip()) == 0:
 
303
  history = []
304
  history.append(("Error", "❌ Please provide a valid API key first"))
305
  return history, ""
306
+
307
  chatbot = MultimodalChatbot(api_key.strip())
308
  return chatbot.chat(text_input=text, history=history)
309
 
 
313
  history = []
314
  history.append(("Error", "❌ Please provide a valid API key first"))
315
  return history, ""
316
+
317
  chatbot = MultimodalChatbot(api_key.strip())
318
  return chatbot.chat(text_input=text, pdf_file=pdf, history=history)
319
 
 
323
  history = []
324
  history.append(("Error", "❌ Please provide a valid API key first"))
325
  return history, ""
326
+
327
  chatbot = MultimodalChatbot(api_key.strip())
328
  return chatbot.chat(text_input=text, audio_file=audio, history=history)
329
 
330
+ def process_combined_input(api_key, text, pdf, audio, history):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
  if not api_key or len(api_key.strip()) == 0:
332
  if history is None:
333
  history = []
334
  history.append(("Error", "❌ Please provide a valid API key first"))
335
  return history, ""
336
+
337
  chatbot = MultimodalChatbot(api_key.strip())
338
+ return chatbot.chat(text, pdf, audio, history)
339
 
340
  def clear_chat():
341
  return [], ""
342
 
343
  def clear_all_inputs():
344
+ return [], "", None, None
345
 
346
  api_key_input.change(
347
  validate_api_key,
348
  inputs=[api_key_input],
349
+ outputs=[api_status, text_submit_btn, pdf_submit_btn, audio_submit_btn, combined_submit_btn]
 
350
  )
351
 
352
  text_submit_btn.click(
 
375
  )
376
  audio_clear_btn.click(lambda: ([], "", None), outputs=[audio_chatbot, audio_text_input, audio_input])
377
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
  combined_submit_btn.click(
379
  process_combined_input,
380
  inputs=[api_key_input, combined_text_input, combined_pdf_input,
381
+ combined_audio_input, combined_chatbot],
382
  outputs=[combined_chatbot, combined_text_input]
383
  )
384
  combined_clear_btn.click(clear_all_inputs,
385
+ outputs=[combined_chatbot, combined_text_input,
386
+ combined_pdf_input, combined_audio_input])
387
 
388
  gr.Markdown("""
389
  ### 🎯 How to Use Each Tab:
390
 
391
+ **πŸ’¬ Text Chat**: Simple text conversations with the AI
392
 
393
  **πŸ“„ PDF Chat**: Upload a PDF and ask questions about its content
394
 
 
396
  - Supports: WAV, MP3, M4A, FLAC, OGG formats
397
  - Best results with clear speech and minimal background noise
398
 
 
 
 
 
 
 
399
  **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis
400
 
401
  ### πŸ”‘ Getting an API Key:
 
406
  5. Copy and paste it in the field above
407
 
408
  ### ⚠️ Current Limitations:
409
+ - Audio transcription requires internet connection for best results
 
410
  - Large files may take longer to process
 
411
  """)
412
 
413
  return demo
 
417
  "gradio",
418
  "openai",
419
  "PyPDF2",
 
420
  "SpeechRecognition",
421
+ "pydub"
 
 
 
 
422
  ]
423
 
424
+ print("πŸš€ Multimodal Chatbot with Gemma 3n")
425
  print("=" * 50)
426
  print("Required packages:", ", ".join(required_packages))
427
  print("\nπŸ“¦ To install: pip install " + " ".join(required_packages))
 
432
  print("πŸ’‘ Enter your API key in the web interface when it loads")
433
 
434
  demo = create_interface()
435
+ demo.launch(
436
+ share=True
437
+ )