shukdevdattaEX committed on
Commit
38b2ece
·
verified ·
1 Parent(s): a220a8f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +334 -0
app.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import base64
3
+ import io
4
+ import os
5
+ from openai import OpenAI
6
+ import PyPDF2
7
+ from PIL import Image
8
+ import speech_recognition as sr
9
+ import tempfile
10
+ import cv2
11
+ import numpy as np
12
+ from typing import List, Tuple, Optional
13
+ import json
14
+
15
class MultimodalChatbot:
    """Multimodal chat client backed by an OpenRouter-hosted Gemma model.

    Accepts text, PDF, audio, image and video inputs, converts each into a
    form the OpenAI-style chat-completions API understands (plain text or
    base64 ``image_url`` data URLs), and returns the model's reply.
    """

    def __init__(self, api_key: str):
        # OpenRouter exposes an OpenAI-compatible endpoint, so the official
        # OpenAI client is reused with a custom base URL.
        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
        self.model = "google/gemma-3n-e2b-it:free"
        # NOTE(review): declared but never sent to the API — each chat()
        # call is a single-turn request, so the model has no cross-turn
        # memory. Kept for interface compatibility; see chat().
        self.conversation_history = []

    def encode_image_to_base64(self, image) -> str:
        """Return a base64 string for *image*.

        Accepts either a filesystem path (whose raw bytes are encoded
        verbatim) or a PIL Image (re-encoded as PNG first).
        """
        if isinstance(image, str):
            # File path: encode the raw file bytes as-is.
            with open(image, "rb") as img_file:
                return base64.b64encode(img_file.read()).decode('utf-8')
        # PIL Image: serialize to PNG in memory, then encode.
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')

    def extract_pdf_text(self, pdf_file) -> str:
        """Extract and concatenate the text of every page of a PDF.

        Returns an error string instead of raising, so a failure can be
        shown inline in the chat transcript.
        """
        try:
            # Gradio may hand us a file object carrying a .name path, or a
            # plain path string.
            pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file

            text = ""
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n"
            return text.strip()
        except Exception as e:
            return f"Error extracting PDF: {str(e)}"

    def transcribe_audio(self, audio_file) -> str:
        """Transcribe an audio file to text via the free Google recognizer.

        Returns an error string instead of raising.
        NOTE(review): sr.AudioFile natively reads WAV/AIFF/FLAC only, so
        MP3/M4A uploads will surface here as an error string — confirm the
        accepted upload types against the UI.
        """
        try:
            recognizer = sr.Recognizer()

            audio_path = audio_file.name if hasattr(audio_file, 'name') else audio_file

            with sr.AudioFile(audio_path) as source:
                audio_data = recognizer.record(source)
                text = recognizer.recognize_google(audio_data)
            return text
        except Exception as e:
            return f"Error transcribing audio: {str(e)}"

    def process_video(self, video_file) -> List[str]:
        """Sample up to 10 frames (one every 30) from a video as base64 PNGs.

        On failure returns a single-element list containing an error string.
        """
        try:
            video_path = video_file.name if hasattr(video_file, 'name') else video_file

            cap = cv2.VideoCapture(video_path)
            frames = []
            frame_index = 0

            # BUG FIX: the original called cap.read() both in the loop
            # condition and in the body (silently discarding every other
            # frame) and capped frame_count — the count of frames SCANNED,
            # not sampled — at 10 while sampling every 30th, so at most one
            # frame was ever collected. Read each frame exactly once and
            # stop after 10 *sampled* frames.
            while len(frames) < 10:
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_index % 30 == 0:
                    # OpenCV decodes frames as BGR; PIL expects RGB.
                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    pil_image = Image.fromarray(rgb_frame)
                    frames.append(self.encode_image_to_base64(pil_image))
                frame_index += 1

            cap.release()
            return frames
        except Exception as e:
            return [f"Error processing video: {str(e)}"]

    def create_multimodal_message(self,
                                  text_input: str = "",
                                  pdf_file=None,
                                  audio_file=None,
                                  image_file=None,
                                  video_file=None) -> dict:
        """Build one user message whose content list mixes text and images.

        PDF text and audio transcriptions are injected as labelled text
        parts; images and sampled video frames become base64 data URLs.
        """
        content_parts = []

        # Plain text from the user, if any.
        if text_input:
            content_parts.append({"type": "text", "text": text_input})

        # PDF: extracted text is passed to the model as labelled text.
        if pdf_file is not None:
            pdf_text = self.extract_pdf_text(pdf_file)
            content_parts.append({
                "type": "text",
                "text": f"PDF Content:\n{pdf_text}"
            })

        # Audio: transcription is passed as labelled text.
        if audio_file is not None:
            audio_text = self.transcribe_audio(audio_file)
            content_parts.append({
                "type": "text",
                "text": f"Audio Transcription:\n{audio_text}"
            })

        # Image: embedded as a base64 PNG data URL.
        if image_file is not None:
            image_base64 = self.encode_image_to_base64(image_file)
            content_parts.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_base64}"
                }
            })

        # Video: each sampled frame becomes its own image part. Error
        # strings from process_video() are skipped rather than embedded.
        if video_file is not None:
            video_frames = self.process_video(video_file)
            for frame_base64 in video_frames:
                if not frame_base64.startswith("Error"):
                    content_parts.append({
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{frame_base64}"
                        }
                    })

        return {"role": "user", "content": content_parts}

    def chat(self,
             text_input: str = "",
             pdf_file=None,
             audio_file=None,
             image_file=None,
             video_file=None,
             history: List[Tuple[str, str]] = None) -> Tuple[List[Tuple[str, str]], str]:
        """Run one chat turn.

        Returns (updated_history, "") — the empty string clears the UI
        textbox. API failures are appended to the history as error rows
        rather than raised.
        """
        if history is None:
            history = []

        # FIX: build the human-readable turn summary BEFORE the try block so
        # it is always defined — the original constructed it inside try and
        # fell back to a 'user_display' in locals() check in the handler.
        user_message_parts = []
        if text_input:
            user_message_parts.append(f"Text: {text_input}")
        if pdf_file:
            user_message_parts.append("πŸ“„ PDF uploaded")
        if audio_file:
            user_message_parts.append("🎀 Audio uploaded")
        if image_file:
            user_message_parts.append("πŸ–ΌοΈ Image uploaded")
        if video_file:
            user_message_parts.append("πŸŽ₯ Video uploaded")

        user_display = " | ".join(user_message_parts)

        try:
            # Assemble the multimodal payload for this turn.
            user_message = self.create_multimodal_message(
                text_input, pdf_file, audio_file, image_file, video_file
            )

            # NOTE(review): only the current turn is sent — prior turns in
            # `history` are display strings, not API messages, so the model
            # receives no conversation context.
            messages = [user_message]

            completion = self.client.chat.completions.create(
                extra_headers={
                    "HTTP-Referer": "https://multimodal-chatbot.local",
                    "X-Title": "Multimodal Chatbot",
                },
                model=self.model,
                messages=messages,
                max_tokens=1024,
                temperature=0.7
            )

            bot_response = completion.choices[0].message.content

            history.append((user_display, bot_response))
            return history, ""

        except Exception as e:
            # Surface the failure in the transcript; keep the original
            # fallback label when the user supplied nothing at all.
            history.append((user_display or "Error in input", f"Error: {str(e)}"))
            return history, ""
213
def create_interface():
    """Assemble and return the Gradio Blocks UI for the multimodal chatbot.

    The OpenRouter key is read from the OPENROUTER_API_KEY environment
    variable; a placeholder keeps the app importable when it is unset.
    """
    bot = MultimodalChatbot(os.getenv("OPENROUTER_API_KEY", "your_api_key_here"))

    with gr.Blocks(title="Multimodal Chatbot with Gemma 3n", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # πŸ€– Multimodal Chatbot with Gemma 3n

        This chatbot can process multiple types of input:
        - **Text**: Regular text messages
        - **PDF**: Extract and analyze document content
        - **Audio**: Transcribe speech to text
        - **Images**: Analyze visual content
        - **Video**: Extract frames and analyze video content

        **Setup**: Set your OpenRouter API key as an environment variable `OPENROUTER_API_KEY`
        """)

        with gr.Row():
            # Left column: every input widget plus the action buttons.
            with gr.Column(scale=1):
                txt_box = gr.Textbox(
                    label="πŸ’¬ Text Input",
                    placeholder="Type your message here...",
                    lines=3
                )
                pdf_upload = gr.File(
                    label="πŸ“„ PDF Upload",
                    file_types=[".pdf"],
                    type="filepath"
                )
                audio_upload = gr.File(
                    label="🎀 Audio Upload",
                    file_types=[".wav", ".mp3", ".m4a", ".flac"],
                    type="filepath"
                )
                img_upload = gr.Image(
                    label="πŸ–ΌοΈ Image Upload",
                    type="pil"
                )
                video_upload = gr.File(
                    label="πŸŽ₯ Video Upload",
                    file_types=[".mp4", ".avi", ".mov", ".mkv"],
                    type="filepath"
                )
                send_btn = gr.Button("πŸš€ Send", variant="primary", size="lg")
                reset_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")

            # Right column: the conversation transcript.
            with gr.Column(scale=2):
                chat_panel = gr.Chatbot(
                    label="Chat History",
                    height=600,
                    bubble_full_width=False
                )

        def on_submit(text, pdf, audio, image, video, history):
            # Delegate one turn to the chatbot; returns (history, "") so the
            # textbox is cleared after each send.
            return bot.chat(text, pdf, audio, image, video, history)

        def on_clear():
            # Reset the transcript, the textbox and every upload slot.
            return [], "", None, None, None, None

        # Both the Send button and the textbox's Enter key trigger the same
        # handler over the same components.
        turn_inputs = [txt_box, pdf_upload, audio_upload, img_upload, video_upload, chat_panel]
        turn_outputs = [chat_panel, txt_box]

        send_btn.click(on_submit, inputs=turn_inputs, outputs=turn_outputs)

        reset_btn.click(
            on_clear,
            outputs=[chat_panel, txt_box, pdf_upload, audio_upload, img_upload, video_upload]
        )

        txt_box.submit(on_submit, inputs=turn_inputs, outputs=turn_outputs)

        gr.Markdown("""
        ### 🎯 Example Usage:
        - Upload a PDF and ask "Summarize this document"
        - Upload an image and ask "What do you see in this image?"
        - Record audio and ask "What did I say?"
        - Upload a video and ask "Describe what's happening"
        - Combine multiple inputs: "Compare this image with the PDF content"
        """)

    return demo
315
if __name__ == "__main__":
    # Third-party requirements, listed for anyone running the app locally.
    required_packages = [
        "gradio",
        "openai",
        "PyPDF2",
        "Pillow",
        "SpeechRecognition",
        "opencv-python",
        "numpy"
    ]

    comma_separated = ", ".join(required_packages)
    space_separated = " ".join(required_packages)
    print("Required packages:", comma_separated)
    print("\nTo install: pip install " + space_separated)
    print("\nDon't forget to set your OPENROUTER_API_KEY environment variable!")

    # Build the UI and serve it with a public share link.
    demo = create_interface()
    demo.launch(share=True)