shukdevdattaEX committed on
Commit
8c4798d
·
verified ·
1 Parent(s): 46e842f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +430 -130
app.py CHANGED
@@ -11,6 +11,8 @@ import cv2
11
  import numpy as np
12
  from typing import List, Tuple, Optional
13
  import json
 
 
14
 
15
  class MultimodalChatbot:
16
  def __init__(self, api_key: str):
@@ -23,15 +25,21 @@ class MultimodalChatbot:
23
 
24
  def encode_image_to_base64(self, image) -> str:
25
  """Convert PIL Image to base64 string"""
26
- if isinstance(image, str):
27
- # If it's a file path
28
- with open(image, "rb") as img_file:
29
- return base64.b64encode(img_file.read()).decode('utf-8')
30
- else:
31
- # If it's a PIL Image
32
- buffered = io.BytesIO()
33
- image.save(buffered, format="PNG")
34
- return base64.b64encode(buffered.getvalue()).decode('utf-8')
 
 
 
 
 
 
35
 
36
  def extract_pdf_text(self, pdf_file) -> str:
37
  """Extract text from PDF file"""
@@ -45,30 +53,70 @@ class MultimodalChatbot:
45
  text = ""
46
  with open(pdf_path, 'rb') as file:
47
  pdf_reader = PyPDF2.PdfReader(file)
48
- for page in pdf_reader.pages:
49
- text += page.extract_text() + "\n"
50
- return text.strip()
 
 
51
  except Exception as e:
52
  return f"Error extracting PDF: {str(e)}"
53
 
54
- def transcribe_audio(self, audio_file) -> str:
55
- """Transcribe audio file to text"""
56
  try:
57
- recognizer = sr.Recognizer()
58
-
59
  if hasattr(audio_file, 'name'):
60
  audio_path = audio_file.name
61
  else:
62
  audio_path = audio_file
63
-
64
- with sr.AudioFile(audio_path) as source:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  audio_data = recognizer.record(source)
66
- text = recognizer.recognize_google(audio_data)
67
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  except Exception as e:
69
  return f"Error transcribing audio: {str(e)}"
70
 
71
- def process_video(self, video_file) -> List[str]:
72
  """Extract frames from video and convert to base64"""
73
  try:
74
  if hasattr(video_file, 'name'):
@@ -77,24 +125,43 @@ class MultimodalChatbot:
77
  video_path = video_file
78
 
79
  cap = cv2.VideoCapture(video_path)
 
 
 
80
  frames = []
 
81
  frame_count = 0
 
 
 
 
 
82
 
83
- # Extract frames (every 30 frames to avoid too many)
84
- while cap.read()[0] and frame_count < 10: # Limit to 10 frames
85
  ret, frame = cap.read()
86
- if ret and frame_count % 30 == 0:
87
  # Convert BGR to RGB
88
  rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
89
  pil_image = Image.fromarray(rgb_frame)
 
 
 
 
90
  base64_frame = self.encode_image_to_base64(pil_image)
91
- frames.append(base64_frame)
 
 
 
 
92
  frame_count += 1
93
 
94
  cap.release()
95
- return frames
 
 
 
96
  except Exception as e:
97
- return [f"Error processing video: {str(e)}"]
98
 
99
  def create_multimodal_message(self,
100
  text_input: str = "",
@@ -105,6 +172,7 @@ class MultimodalChatbot:
105
  """Create a multimodal message for the API"""
106
 
107
  content_parts = []
 
108
 
109
  # Add text content
110
  if text_input:
@@ -117,6 +185,7 @@ class MultimodalChatbot:
117
  "type": "text",
118
  "text": f"PDF Content:\n{pdf_text}"
119
  })
 
120
 
121
  # Process Audio
122
  if audio_file is not None:
@@ -125,30 +194,35 @@ class MultimodalChatbot:
125
  "type": "text",
126
  "text": f"Audio Transcription:\n{audio_text}"
127
  })
 
128
 
129
- # Process Image
130
  if image_file is not None:
131
- image_base64 = self.encode_image_to_base64(image_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  content_parts.append({
133
- "type": "image_url",
134
- "image_url": {
135
- "url": f"data:image/png;base64,{image_base64}"
136
- }
137
  })
 
138
 
139
- # Process Video
140
- if video_file is not None:
141
- video_frames = self.process_video(video_file)
142
- for i, frame_base64 in enumerate(video_frames):
143
- if not frame_base64.startswith("Error"):
144
- content_parts.append({
145
- "type": "image_url",
146
- "image_url": {
147
- "url": f"data:image/png;base64,{frame_base64}"
148
- }
149
- })
150
-
151
- return {"role": "user", "content": content_parts}
152
 
153
  def chat(self,
154
  text_input: str = "",
@@ -179,10 +253,14 @@ class MultimodalChatbot:
179
  user_display = " | ".join(user_message_parts)
180
 
181
  # Create multimodal message
182
- user_message = self.create_multimodal_message(
183
  text_input, pdf_file, audio_file, image_file, video_file
184
  )
185
 
 
 
 
 
186
  # Add to conversation history
187
  messages = [user_message]
188
 
@@ -194,7 +272,7 @@ class MultimodalChatbot:
194
  },
195
  model=self.model,
196
  messages=messages,
197
- max_tokens=1024,
198
  temperature=0.7
199
  )
200
 
@@ -213,9 +291,6 @@ class MultimodalChatbot:
213
  def create_interface():
214
  """Create the Gradio interface"""
215
 
216
- # Chatbot will be initialized when API key is provided
217
- chatbot = None
218
-
219
  with gr.Blocks(title="Multimodal Chatbot with Gemma 3n", theme=gr.themes.Soft()) as demo:
220
  gr.Markdown("""
221
  # πŸ€– Multimodal Chatbot with Gemma 3n
@@ -223,9 +298,9 @@ def create_interface():
223
  This chatbot can process multiple types of input:
224
  - **Text**: Regular text messages
225
  - **PDF**: Extract and analyze document content
226
- - **Audio**: Transcribe speech to text
227
- - **Images**: Analyze visual content
228
- - **Video**: Extract frames and analyze video content
229
 
230
  **Setup**: Enter your OpenRouter API key below to get started
231
  """)
@@ -245,53 +320,175 @@ def create_interface():
245
  interactive=False
246
  )
247
 
248
- with gr.Row():
249
- with gr.Column(scale=1):
250
- # Input components
251
- text_input = gr.Textbox(
252
- label="πŸ’¬ Text Input",
253
- placeholder="Type your message here...",
254
- lines=3
255
- )
256
-
257
- pdf_input = gr.File(
258
- label="πŸ“„ PDF Upload",
259
- file_types=[".pdf"],
260
- type="filepath"
261
- )
262
-
263
- audio_input = gr.File(
264
- label="🎀 Audio Upload",
265
- file_types=[".wav", ".mp3", ".m4a", ".flac"],
266
- type="filepath"
267
- )
268
-
269
- image_input = gr.Image(
270
- label="πŸ–ΌοΈ Image Upload",
271
- type="pil"
272
- )
273
-
274
- video_input = gr.File(
275
- label="πŸŽ₯ Video Upload",
276
- file_types=[".mp4", ".avi", ".mov", ".mkv"],
277
- type="filepath"
278
- )
279
-
280
- submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
281
- clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
282
-
283
- with gr.Column(scale=2):
284
- # Chat interface
285
- chatbot_interface = gr.Chatbot(
286
- label="Chat History",
287
- height=600,
288
- bubble_full_width=False
289
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
 
291
  # Event handlers
292
  def validate_api_key(api_key):
293
  if not api_key or len(api_key.strip()) == 0:
294
- return "❌ API Key not provided", gr.update(interactive=False)
295
 
296
  try:
297
  # Test the API key by creating a client
@@ -299,60 +496,157 @@ def create_interface():
299
  base_url="https://openrouter.ai/api/v1",
300
  api_key=api_key.strip(),
301
  )
302
- return "βœ… API Key validated successfully", gr.update(interactive=True)
303
  except Exception as e:
304
- return f"❌ API Key validation failed: {str(e)}", gr.update(interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
- def process_input(api_key, text, pdf, audio, image, video, history):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  if not api_key or len(api_key.strip()) == 0:
308
  if history is None:
309
  history = []
310
  history.append(("Error", "❌ Please provide a valid API key first"))
311
  return history, ""
312
 
313
- # Initialize chatbot with the provided API key
314
  chatbot = MultimodalChatbot(api_key.strip())
315
  return chatbot.chat(text, pdf, audio, image, video, history)
316
 
317
- def clear_all():
 
 
 
318
  return [], "", None, None, None, None
319
 
320
  # API Key validation
321
  api_key_input.change(
322
  validate_api_key,
323
  inputs=[api_key_input],
324
- outputs=[api_status, submit_btn]
 
325
  )
326
 
327
- # Button events
328
- submit_btn.click(
329
- process_input,
330
- inputs=[api_key_input, text_input, pdf_input, audio_input, image_input, video_input, chatbot_interface],
331
- outputs=[chatbot_interface, text_input]
332
  )
 
 
 
 
 
 
333
 
334
- clear_btn.click(
335
- clear_all,
336
- outputs=[chatbot_interface, text_input, pdf_input, audio_input, image_input, video_input]
 
 
337
  )
 
338
 
339
- # Enter key support
340
- text_input.submit(
341
- process_input,
342
- inputs=[api_key_input, text_input, pdf_input, audio_input, image_input, video_input, chatbot_interface],
343
- outputs=[chatbot_interface, text_input]
344
  )
 
345
 
346
- # Examples
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  gr.Markdown("""
348
- ### 🎯 Example Usage:
349
- 1. **First**: Enter your OpenRouter API key in the field above
350
- 2. **Then try these examples**:
351
- - Upload a PDF and ask "Summarize this document"
352
- - Upload an image and ask "What do you see in this image?"
353
- - Record audio and ask "What did I say?"
354
- - Upload a video and ask "Describe what's happening"
355
- - Combine multiple inputs: "Compare this image with the PDF content"
 
 
 
 
 
 
 
356
 
357
  ### πŸ”‘ Getting an API Key:
358
  1. Go to [OpenRouter.ai](https://openrouter.ai)
@@ -360,6 +654,11 @@ def create_interface():
360
  3. Navigate to the API Keys section
361
  4. Create a new API key
362
  5. Copy and paste it in the field above
 
 
 
 
 
363
  """)
364
 
365
  return demo
@@ -373,20 +672,21 @@ if __name__ == "__main__":
373
  "Pillow",
374
  "SpeechRecognition",
375
  "opencv-python",
376
- "numpy"
 
377
  ]
378
 
379
  print("πŸš€ Multimodal Chatbot with Gemma 3n")
380
  print("=" * 50)
381
  print("Required packages:", ", ".join(required_packages))
382
  print("\nπŸ“¦ To install: pip install " + " ".join(required_packages))
 
 
 
383
  print("\nπŸ”‘ Get your API key from: https://openrouter.ai")
384
  print("πŸ’‘ Enter your API key in the web interface when it loads")
385
 
386
  demo = create_interface()
387
  demo.launch(
388
- share=True,
389
- server_name="0.0.0.0",
390
- server_port=7860,
391
- debug=True
392
  )
 
11
  import numpy as np
12
  from typing import List, Tuple, Optional
13
  import json
14
+ import pydub
15
+ from pydub import AudioSegment
16
 
17
  class MultimodalChatbot:
18
  def __init__(self, api_key: str):
 
25
 
26
  def encode_image_to_base64(self, image) -> str:
27
  """Convert PIL Image to base64 string"""
28
+ try:
29
+ if isinstance(image, str):
30
+ # If it's a file path
31
+ with open(image, "rb") as img_file:
32
+ return base64.b64encode(img_file.read()).decode('utf-8')
33
+ else:
34
+ # If it's a PIL Image
35
+ buffered = io.BytesIO()
36
+ # Convert to RGB if it's RGBA
37
+ if image.mode == 'RGBA':
38
+ image = image.convert('RGB')
39
+ image.save(buffered, format="JPEG", quality=85)
40
+ return base64.b64encode(buffered.getvalue()).decode('utf-8')
41
+ except Exception as e:
42
+ return f"Error encoding image: {str(e)}"
43
 
44
  def extract_pdf_text(self, pdf_file) -> str:
45
  """Extract text from PDF file"""
 
53
  text = ""
54
  with open(pdf_path, 'rb') as file:
55
  pdf_reader = PyPDF2.PdfReader(file)
56
+ for page_num, page in enumerate(pdf_reader.pages):
57
+ page_text = page.extract_text()
58
+ if page_text.strip():
59
+ text += f"Page {page_num + 1}:\n{page_text}\n\n"
60
+ return text.strip() if text.strip() else "No text could be extracted from this PDF."
61
  except Exception as e:
62
  return f"Error extracting PDF: {str(e)}"
63
 
64
def convert_audio_to_wav(self, audio_file) -> str:
    """Return the path of a WAV version of ``audio_file``.

    Args:
        audio_file: A filesystem path, or an object whose ``name``
            attribute holds the path.

    Returns:
        The input path unchanged when its extension is already ``.wav``
        (case-insensitive); otherwise the path of a newly created temporary
        mono, 16 kHz WAV file. The caller owns that temporary file and is
        responsible for deleting it.

    Raises:
        Exception: With the message ``"Error converting audio: ..."`` if
            the path cannot be resolved or the conversion fails. The
            original error is attached as ``__cause__``.
    """
    try:
        audio_path = audio_file.name if hasattr(audio_file, 'name') else audio_file

        # Already WAV: nothing to convert.
        if os.path.splitext(audio_path)[1].lower() == '.wav':
            return audio_path

        # Decode first so no temporary file is created for unreadable input.
        audio = AudioSegment.from_file(audio_path)

        # mkstemp creates the file atomically; tempfile.mktemp only returns
        # a name and is deprecated because another process can claim it
        # before we write to it.
        fd, wav_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
        try:
            audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
        except Exception:
            # Do not leave an empty or partial file behind on failure.
            os.remove(wav_path)
            raise
        return wav_path

    except Exception as e:
        raise Exception(f"Error converting audio: {str(e)}") from e
88
+
89
def transcribe_audio(self, audio_file) -> str:
    """Transcribe an audio file to text.

    The file is first converted to WAV via ``convert_audio_to_wav``, then
    sent to Google Speech Recognition. If that service cannot be reached,
    offline Sphinx recognition is attempted as a fallback.

    Args:
        audio_file: A filesystem path, or an object whose ``name``
            attribute holds the path.

    Returns:
        The transcription, or a human-readable message describing why
        transcription failed. This method does not raise.
    """
    import os  # local import: needed only for temp-file cleanup below

    wav_path = None
    source_path = None
    try:
        recognizer = sr.Recognizer()

        source_path = audio_file.name if hasattr(audio_file, 'name') else audio_file
        wav_path = self.convert_audio_to_wav(audio_file)

        with sr.AudioFile(wav_path) as source:
            # Calibrate the recognizer against the start of the recording.
            recognizer.adjust_for_ambient_noise(source, duration=0.2)
            audio_data = recognizer.record(source)

        try:
            return recognizer.recognize_google(audio_data)
        except sr.UnknownValueError:
            return "Could not understand the audio. Please try with clearer audio."
        except sr.RequestError as e:
            # Online service unavailable: fall back to offline recognition.
            try:
                return recognizer.recognize_sphinx(audio_data)
            except Exception:
                # Report the original service error, which is the more
                # useful message. `except Exception` (not a bare except)
                # lets KeyboardInterrupt/SystemExit propagate.
                return f"Speech recognition service error: {str(e)}"

    except Exception as e:
        return f"Error transcribing audio: {str(e)}"
    finally:
        # Conversion hands back a temporary file for non-WAV input; remove
        # it so repeated uploads do not accumulate files. The user's own
        # file (returned unchanged for WAV input) is never deleted.
        if wav_path is not None and wav_path != source_path:
            try:
                os.remove(wav_path)
            except OSError:
                pass  # best-effort cleanup; the transcription result stands
118
 
119
def process_video(self, video_file) -> Tuple[List[str], str]:
    """Sample up to five frames from a video and encode them as base64.

    Args:
        video_file: A filesystem path, or an object whose ``name``
            attribute holds the path.

    Returns:
        A ``(frames, description)`` tuple. ``frames`` is a list of base64
        strings produced by ``encode_image_to_base64`` (at most five);
        ``description`` summarises how many frames were extracted. On
        failure ``frames`` is empty and ``description`` holds the error
        message. This method does not raise.
    """
    cap = None
    try:
        video_path = video_file.name if hasattr(video_file, 'name') else video_file

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return [], "Error: Could not open video file"

        frames = []
        frame_count = 0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)

        # Sample one frame per interval: two seconds' worth of frames, but
        # never fewer than 60 frames apart (60 when the FPS is unknown).
        frame_interval = max(60, int(fps * 2)) if fps > 0 else 60

        while len(frames) < 5:  # Limit to 5 frames
            # Read exactly once per iteration. Reading in the loop condition
            # as well would silently drop every other frame and make
            # frame_count run at half the true rate.
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                # OpenCV decodes to BGR; PIL expects RGB.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                pil_image = Image.fromarray(rgb_frame)

                # Shrink in place to keep the encoded payload small.
                pil_image.thumbnail((800, 600), Image.Resampling.LANCZOS)

                base64_frame = self.encode_image_to_base64(pil_image)
                if not base64_frame.startswith("Error"):
                    frames.append(base64_frame)

            frame_count += 1

        description = f"Video processed: {len(frames)} frames extracted from {total_frames} total frames"
        return frames, description

    except Exception as e:
        return [], f"Error processing video: {str(e)}"
    finally:
        # Release the capture on every path, including errors and the
        # early "could not open" return.
        if cap is not None:
            cap.release()
165
 
166
  def create_multimodal_message(self,
167
  text_input: str = "",
 
172
  """Create a multimodal message for the API"""
173
 
174
  content_parts = []
175
+ processing_info = []
176
 
177
  # Add text content
178
  if text_input:
 
185
  "type": "text",
186
  "text": f"PDF Content:\n{pdf_text}"
187
  })
188
+ processing_info.append("πŸ“„ PDF processed")
189
 
190
  # Process Audio
191
  if audio_file is not None:
 
194
  "type": "text",
195
  "text": f"Audio Transcription:\n{audio_text}"
196
  })
197
+ processing_info.append("🎀 Audio transcribed")
198
 
199
+ # Process Image - Use text-only approach since vision isn't supported
200
  if image_file is not None:
201
+ # Since vision isn't supported, we'll describe what we can about the image
202
+ if hasattr(image_file, 'size'):
203
+ width, height = image_file.size
204
+ mode = image_file.mode
205
+ content_parts.append({
206
+ "type": "text",
207
+ "text": f"Image uploaded: {width}x{height} pixels, mode: {mode}. Note: Visual analysis not available with current model configuration."
208
+ })
209
+ else:
210
+ content_parts.append({
211
+ "type": "text",
212
+ "text": "Image uploaded. Note: Visual analysis not available with current model configuration."
213
+ })
214
+ processing_info.append("πŸ–ΌοΈ Image received (metadata only)")
215
+
216
+ # Process Video - Use text-only approach since vision isn't supported
217
+ if video_file is not None:
218
+ frames, video_desc = self.process_video(video_file)
219
  content_parts.append({
220
+ "type": "text",
221
+ "text": f"Video uploaded: {video_desc}. Note: Visual analysis not available with current model configuration."
 
 
222
  })
223
+ processing_info.append("πŸŽ₯ Video processed (metadata only)")
224
 
225
+ return {"role": "user", "content": content_parts}, processing_info
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
  def chat(self,
228
  text_input: str = "",
 
253
  user_display = " | ".join(user_message_parts)
254
 
255
  # Create multimodal message
256
+ user_message, processing_info = self.create_multimodal_message(
257
  text_input, pdf_file, audio_file, image_file, video_file
258
  )
259
 
260
+ # Add processing info to display
261
+ if processing_info:
262
+ user_display += f"\n{' | '.join(processing_info)}"
263
+
264
  # Add to conversation history
265
  messages = [user_message]
266
 
 
272
  },
273
  model=self.model,
274
  messages=messages,
275
+ max_tokens=2048,
276
  temperature=0.7
277
  )
278
 
 
291
  def create_interface():
292
  """Create the Gradio interface"""
293
 
 
 
 
294
  with gr.Blocks(title="Multimodal Chatbot with Gemma 3n", theme=gr.themes.Soft()) as demo:
295
  gr.Markdown("""
296
  # πŸ€– Multimodal Chatbot with Gemma 3n
 
298
  This chatbot can process multiple types of input:
299
  - **Text**: Regular text messages
300
  - **PDF**: Extract and analyze document content
301
+ - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)
302
+ - **Images**: Upload images (metadata analysis only due to model limitations)
303
+ - **Video**: Upload videos (metadata analysis only due to model limitations)
304
 
305
  **Setup**: Enter your OpenRouter API key below to get started
306
  """)
 
320
  interactive=False
321
  )
322
 
323
+ # Tabbed Interface
324
+ with gr.Tabs():
325
+ # Text Chat Tab
326
+ with gr.TabItem("πŸ’¬ Text Chat"):
327
+ with gr.Row():
328
+ with gr.Column(scale=1):
329
+ text_input = gr.Textbox(
330
+ label="πŸ’¬ Text Input",
331
+ placeholder="Type your message here...",
332
+ lines=5
333
+ )
334
+ text_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
335
+ text_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
336
+
337
+ with gr.Column(scale=2):
338
+ text_chatbot = gr.Chatbot(
339
+ label="Text Chat History",
340
+ height=600,
341
+ bubble_full_width=False,
342
+ show_copy_button=True
343
+ )
344
+
345
+ # PDF Chat Tab
346
+ with gr.TabItem("πŸ“„ PDF Chat"):
347
+ with gr.Row():
348
+ with gr.Column(scale=1):
349
+ pdf_input = gr.File(
350
+ label="πŸ“„ PDF Upload",
351
+ file_types=[".pdf"],
352
+ type="filepath"
353
+ )
354
+ pdf_text_input = gr.Textbox(
355
+ label="πŸ’¬ Question about PDF",
356
+ placeholder="Ask something about the PDF...",
357
+ lines=3
358
+ )
359
+ pdf_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
360
+ pdf_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
361
+
362
+ with gr.Column(scale=2):
363
+ pdf_chatbot = gr.Chatbot(
364
+ label="PDF Chat History",
365
+ height=600,
366
+ bubble_full_width=False,
367
+ show_copy_button=True
368
+ )
369
+
370
+ # Audio Chat Tab
371
+ with gr.TabItem("🎀 Audio Chat"):
372
+ with gr.Row():
373
+ with gr.Column(scale=1):
374
+ audio_input = gr.File(
375
+ label="🎀 Audio Upload",
376
+ file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
377
+ type="filepath"
378
+ )
379
+ audio_text_input = gr.Textbox(
380
+ label="πŸ’¬ Question about Audio",
381
+ placeholder="Ask something about the audio...",
382
+ lines=3
383
+ )
384
+ audio_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
385
+ audio_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
386
+
387
+ with gr.Column(scale=2):
388
+ audio_chatbot = gr.Chatbot(
389
+ label="Audio Chat History",
390
+ height=600,
391
+ bubble_full_width=False,
392
+ show_copy_button=True
393
+ )
394
+
395
+ # Image Chat Tab
396
+ with gr.TabItem("πŸ–ΌοΈ Image Chat"):
397
+ with gr.Row():
398
+ with gr.Column(scale=1):
399
+ image_input = gr.Image(
400
+ label="πŸ–ΌοΈ Image Upload",
401
+ type="pil"
402
+ )
403
+ image_text_input = gr.Textbox(
404
+ label="πŸ’¬ Question about Image",
405
+ placeholder="Ask something about the image...",
406
+ lines=3
407
+ )
408
+ image_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
409
+ image_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
410
+
411
+ with gr.Column(scale=2):
412
+ image_chatbot = gr.Chatbot(
413
+ label="Image Chat History",
414
+ height=600,
415
+ bubble_full_width=False,
416
+ show_copy_button=True
417
+ )
418
+
419
+ # Video Chat Tab
420
+ with gr.TabItem("πŸŽ₯ Video Chat"):
421
+ with gr.Row():
422
+ with gr.Column(scale=1):
423
+ video_input = gr.File(
424
+ label="πŸŽ₯ Video Upload",
425
+ file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
426
+ type="filepath"
427
+ )
428
+ video_text_input = gr.Textbox(
429
+ label="πŸ’¬ Question about Video",
430
+ placeholder="Ask something about the video...",
431
+ lines=3
432
+ )
433
+ video_submit_btn = gr.Button("πŸš€ Send", variant="primary", size="lg", interactive=False)
434
+ video_clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
435
+
436
+ with gr.Column(scale=2):
437
+ video_chatbot = gr.Chatbot(
438
+ label="Video Chat History",
439
+ height=600,
440
+ bubble_full_width=False,
441
+ show_copy_button=True
442
+ )
443
+
444
+ # Combined Chat Tab
445
+ with gr.TabItem("🌟 Combined Chat"):
446
+ with gr.Row():
447
+ with gr.Column(scale=1):
448
+ combined_text_input = gr.Textbox(
449
+ label="πŸ’¬ Text Input",
450
+ placeholder="Type your message here...",
451
+ lines=3
452
+ )
453
+
454
+ combined_pdf_input = gr.File(
455
+ label="πŸ“„ PDF Upload",
456
+ file_types=[".pdf"],
457
+ type="filepath"
458
+ )
459
+
460
+ combined_audio_input = gr.File(
461
+ label="🎀 Audio Upload",
462
+ file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
463
+ type="filepath"
464
+ )
465
+
466
+ combined_image_input = gr.Image(
467
+ label="πŸ–ΌοΈ Image Upload",
468
+ type="pil"
469
+ )
470
+
471
+ combined_video_input = gr.File(
472
+ label="πŸŽ₯ Video Upload",
473
+ file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
474
+ type="filepath"
475
+ )
476
+
477
+ combined_submit_btn = gr.Button("πŸš€ Send All", variant="primary", size="lg", interactive=False)
478
+ combined_clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
479
+
480
+ with gr.Column(scale=2):
481
+ combined_chatbot = gr.Chatbot(
482
+ label="Combined Chat History",
483
+ height=600,
484
+ bubble_full_width=False,
485
+ show_copy_button=True
486
+ )
487
 
488
  # Event handlers
489
  def validate_api_key(api_key):
490
  if not api_key or len(api_key.strip()) == 0:
491
+ return "❌ API Key not provided", *[gr.update(interactive=False) for _ in range(6)]
492
 
493
  try:
494
  # Test the API key by creating a client
 
496
  base_url="https://openrouter.ai/api/v1",
497
  api_key=api_key.strip(),
498
  )
499
+ return "βœ… API Key validated successfully", *[gr.update(interactive=True) for _ in range(6)]
500
  except Exception as e:
501
+ return f"❌ API Key validation failed: {str(e)}", *[gr.update(interactive=False) for _ in range(6)]
502
+
503
+ def process_text_input(api_key, text, history):
504
+ if not api_key or len(api_key.strip()) == 0:
505
+ if history is None:
506
+ history = []
507
+ history.append(("Error", "❌ Please provide a valid API key first"))
508
+ return history, ""
509
+
510
+ chatbot = MultimodalChatbot(api_key.strip())
511
+ return chatbot.chat(text_input=text, history=history)
512
+
513
+ def process_pdf_input(api_key, pdf, text, history):
514
+ if not api_key or len(api_key.strip()) == 0:
515
+ if history is None:
516
+ history = []
517
+ history.append(("Error", "❌ Please provide a valid API key first"))
518
+ return history, ""
519
+
520
+ chatbot = MultimodalChatbot(api_key.strip())
521
+ return chatbot.chat(text_input=text, pdf_file=pdf, history=history)
522
 
523
+ def process_audio_input(api_key, audio, text, history):
524
+ if not api_key or len(api_key.strip()) == 0:
525
+ if history is None:
526
+ history = []
527
+ history.append(("Error", "❌ Please provide a valid API key first"))
528
+ return history, ""
529
+
530
+ chatbot = MultimodalChatbot(api_key.strip())
531
+ return chatbot.chat(text_input=text, audio_file=audio, history=history)
532
+
533
+ def process_image_input(api_key, image, text, history):
534
+ if not api_key or len(api_key.strip()) == 0:
535
+ if history is None:
536
+ history = []
537
+ history.append(("Error", "❌ Please provide a valid API key first"))
538
+ return history, ""
539
+
540
+ chatbot = MultimodalChatbot(api_key.strip())
541
+ return chatbot.chat(text_input=text, image_file=image, history=history)
542
+
543
+ def process_video_input(api_key, video, text, history):
544
+ if not api_key or len(api_key.strip()) == 0:
545
+ if history is None:
546
+ history = []
547
+ history.append(("Error", "❌ Please provide a valid API key first"))
548
+ return history, ""
549
+
550
+ chatbot = MultimodalChatbot(api_key.strip())
551
+ return chatbot.chat(text_input=text, video_file=video, history=history)
552
+
553
+ def process_combined_input(api_key, text, pdf, audio, image, video, history):
554
  if not api_key or len(api_key.strip()) == 0:
555
  if history is None:
556
  history = []
557
  history.append(("Error", "❌ Please provide a valid API key first"))
558
  return history, ""
559
 
 
560
  chatbot = MultimodalChatbot(api_key.strip())
561
  return chatbot.chat(text, pdf, audio, image, video, history)
562
 
563
+ def clear_chat():
564
+ return [], ""
565
+
566
+ def clear_all_inputs():
567
  return [], "", None, None, None, None
568
 
569
  # API Key validation
570
  api_key_input.change(
571
  validate_api_key,
572
  inputs=[api_key_input],
573
+ outputs=[api_status, text_submit_btn, pdf_submit_btn, audio_submit_btn,
574
+ image_submit_btn, video_submit_btn, combined_submit_btn]
575
  )
576
 
577
+ # Text chat events
578
+ text_submit_btn.click(
579
+ process_text_input,
580
+ inputs=[api_key_input, text_input, text_chatbot],
581
+ outputs=[text_chatbot, text_input]
582
  )
583
+ text_input.submit(
584
+ process_text_input,
585
+ inputs=[api_key_input, text_input, text_chatbot],
586
+ outputs=[text_chatbot, text_input]
587
+ )
588
+ text_clear_btn.click(clear_chat, outputs=[text_chatbot, text_input])
589
 
590
+ # PDF chat events
591
+ pdf_submit_btn.click(
592
+ process_pdf_input,
593
+ inputs=[api_key_input, pdf_input, pdf_text_input, pdf_chatbot],
594
+ outputs=[pdf_chatbot, pdf_text_input]
595
  )
596
+ pdf_clear_btn.click(lambda: ([], "", None), outputs=[pdf_chatbot, pdf_text_input, pdf_input])
597
 
598
+ # Audio chat events
599
+ audio_submit_btn.click(
600
+ process_audio_input,
601
+ inputs=[api_key_input, audio_input, audio_text_input, audio_chatbot],
602
+ outputs=[audio_chatbot, audio_text_input]
603
  )
604
+ audio_clear_btn.click(lambda: ([], "", None), outputs=[audio_chatbot, audio_text_input, audio_input])
605
 
606
+ # Image chat events
607
+ image_submit_btn.click(
608
+ process_image_input,
609
+ inputs=[api_key_input, image_input, image_text_input, image_chatbot],
610
+ outputs=[image_chatbot, image_text_input]
611
+ )
612
+ image_clear_btn.click(lambda: ([], "", None), outputs=[image_chatbot, image_text_input, image_input])
613
+
614
+ # Video chat events
615
+ video_submit_btn.click(
616
+ process_video_input,
617
+ inputs=[api_key_input, video_input, video_text_input, video_chatbot],
618
+ outputs=[video_chatbot, video_text_input]
619
+ )
620
+ video_clear_btn.click(lambda: ([], "", None), outputs=[video_chatbot, video_text_input, video_input])
621
+
622
+ # Combined chat events
623
+ combined_submit_btn.click(
624
+ process_combined_input,
625
+ inputs=[api_key_input, combined_text_input, combined_pdf_input,
626
+ combined_audio_input, combined_image_input, combined_video_input, combined_chatbot],
627
+ outputs=[combined_chatbot, combined_text_input]
628
+ )
629
+ combined_clear_btn.click(clear_all_inputs,
630
+ outputs=[combined_chatbot, combined_text_input, combined_pdf_input,
631
+ combined_audio_input, combined_image_input, combined_video_input])
632
+
633
+ # Examples and Instructions
634
  gr.Markdown("""
635
+ ### 🎯 How to Use Each Tab:
636
+
637
+ **πŸ’¬ Text Chat**: Simple text conversations with the AI
638
+
639
+ **πŸ“„ PDF Chat**: Upload a PDF and ask questions about its content
640
+
641
+ **🎀 Audio Chat**: Upload audio files for transcription and analysis
642
+ - Supports: WAV, MP3, M4A, FLAC, OGG formats
643
+ - Best results with clear speech and minimal background noise
644
+
645
+ **πŸ–ΌοΈ Image Chat**: Upload images (currently metadata only due to model limitations)
646
+
647
+ **πŸŽ₯ Video Chat**: Upload videos (currently metadata only due to model limitations)
648
+
649
+ **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis
650
 
651
  ### πŸ”‘ Getting an API Key:
652
  1. Go to [OpenRouter.ai](https://openrouter.ai)
 
654
  3. Navigate to the API Keys section
655
  4. Create a new API key
656
  5. Copy and paste it in the field above
657
+
658
+ ### ⚠️ Current Limitations:
659
+ - Image and video visual analysis not supported by the free Gemma 3n model
660
+ - Audio transcription requires internet connection for best results
661
+ - Large files may take longer to process
662
  """)
663
 
664
  return demo
 
672
  "Pillow",
673
  "SpeechRecognition",
674
  "opencv-python",
675
+ "numpy",
676
+ "pydub"
677
  ]
678
 
679
  print("πŸš€ Multimodal Chatbot with Gemma 3n")
680
  print("=" * 50)
681
  print("Required packages:", ", ".join(required_packages))
682
  print("\nπŸ“¦ To install: pip install " + " ".join(required_packages))
683
+ print("\n🎀 For audio processing, you may also need:")
684
+ print(" - ffmpeg (for audio conversion)")
685
+ print(" - sudo apt-get install espeak espeak-data libespeak1 libespeak-dev (for offline speech recognition)")
686
  print("\nπŸ”‘ Get your API key from: https://openrouter.ai")
687
  print("πŸ’‘ Enter your API key in the web interface when it loads")
688
 
689
  demo = create_interface()
690
  demo.launch(
691
+ share=True
 
 
 
692
  )