shukdevdattaEX committed on
Commit
2b051f4
·
verified ·
1 Parent(s): 11f4277

Create v1.txt

Browse files
Files changed (1) hide show
  1. v1.txt +437 -0
v1.txt ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import base64
3
+ import io
4
+ import os
5
+ from openai import OpenAI
6
+ import PyPDF2
7
+ import speech_recognition as sr
8
+ import tempfile
9
+ from pydub import AudioSegment
10
+ from typing import List, Tuple, Optional
11
+
12
class MultimodalChatbot:
    """Multimodal chat client backed by an OpenRouter-hosted Gemma model.

    Accepts plain text, PDF documents (text-extracted via PyPDF2), and audio
    files (transcribed via SpeechRecognition), and folds them into a single
    multimodal user message sent to the chat-completions endpoint.
    """

    def __init__(self, api_key: str):
        """Create an OpenAI-compatible client pointed at OpenRouter.

        Args:
            api_key: OpenRouter API key used for this session only.
        """
        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
        self.model = "google/gemma-3n-e2b-it:free"
        # NOTE(review): the UI layer builds a fresh instance per request, so
        # this per-instance history never accumulates across turns.
        self.conversation_history = []

    def extract_pdf_text(self, pdf_file) -> str:
        """Extract text from a PDF, labelled page by page.

        Args:
            pdf_file: Either a filesystem path or an object with a ``.name``
                attribute (e.g. a Gradio file wrapper).

        Returns:
            The extracted text, a "no text" notice for image-only PDFs, or an
            error string (this method never raises).
        """
        try:
            # Gradio may hand us either a filepath string or a file wrapper.
            pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file

            # Collect page chunks and join once instead of quadratic +=.
            chunks = []
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num, page in enumerate(pdf_reader.pages):
                    page_text = page.extract_text()
                    if page_text.strip():
                        chunks.append(f"Page {page_num + 1}:\n{page_text}\n\n")
            text = "".join(chunks)
            return text.strip() if text.strip() else "No text could be extracted from this PDF."
        except Exception as e:
            return f"Error extracting PDF: {str(e)}"

    def convert_audio_to_wav(self, audio_file) -> str:
        """Convert an audio file to 16 kHz mono WAV for speech recognition.

        Args:
            audio_file: Filesystem path or object with a ``.name`` attribute.

        Returns:
            Path to a WAV file (the input path itself if already ``.wav``).

        Raises:
            Exception: With a descriptive message if conversion fails.
        """
        try:
            audio_path = audio_file.name if hasattr(audio_file, 'name') else audio_file

            file_ext = os.path.splitext(audio_path)[1].lower()
            if file_ext == '.wav':
                return audio_path

            audio = AudioSegment.from_file(audio_path)
            # tempfile.mktemp is deprecated and race-prone; mkstemp creates
            # the file securely — we only need its path for pydub/ffmpeg.
            fd, wav_path = tempfile.mkstemp(suffix='.wav')
            os.close(fd)
            audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
            return wav_path
        except Exception as e:
            raise Exception(f"Error converting audio: {str(e)}")

    def transcribe_audio(self, audio_file) -> str:
        """Transcribe an audio file to text.

        Tries the online Google recognizer first and falls back to the
        offline Sphinx engine if the service is unreachable.

        Returns:
            The transcription, or a human-readable error string (never raises).
        """
        try:
            recognizer = sr.Recognizer()
            wav_path = self.convert_audio_to_wav(audio_file)

            with sr.AudioFile(wav_path) as source:
                recognizer.adjust_for_ambient_noise(source, duration=0.2)
                audio_data = recognizer.record(source)

            try:
                return recognizer.recognize_google(audio_data)
            except sr.UnknownValueError:
                return "Could not understand the audio. Please try with clearer audio."
            except sr.RequestError as e:
                # Online service unavailable — try the offline engine, but
                # never let a bare failure escape (Sphinx may not be installed).
                try:
                    return recognizer.recognize_sphinx(audio_data)
                except Exception:
                    return f"Speech recognition service error: {str(e)}"
        except Exception as e:
            return f"Error transcribing audio: {str(e)}"

    def create_multimodal_message(self,
                                  text_input: str = "",
                                  pdf_file=None,
                                  audio_file=None) -> Tuple[dict, list]:
        """Build a multimodal user message for the chat API.

        Args:
            text_input: Free-form text from the user (skipped if empty).
            pdf_file: Optional PDF whose extracted text is appended.
            audio_file: Optional audio file whose transcription is appended.

        Returns:
            A ``(message, processing_info)`` pair: the API-shaped user message
            and a list of human-readable notes about what was processed.
        """
        content_parts = []
        processing_info = []

        if text_input:
            content_parts.append({"type": "text", "text": text_input})

        if pdf_file is not None:
            pdf_text = self.extract_pdf_text(pdf_file)
            content_parts.append({
                "type": "text",
                "text": f"PDF Content:\n{pdf_text}"
            })
            processing_info.append("📄 PDF processed")

        if audio_file is not None:
            audio_text = self.transcribe_audio(audio_file)
            content_parts.append({
                "type": "text",
                "text": f"Audio Transcription:\n{audio_text}"
            })
            processing_info.append("🎤 Audio transcribed")

        return {"role": "user", "content": content_parts}, processing_info

    def chat(self,
             text_input: str = "",
             pdf_file=None,
             audio_file=None,
             history: Optional[List[Tuple[str, str]]] = None) -> Tuple[List[Tuple[str, str]], str]:
        """Run one chat turn against the model.

        Args:
            text_input: User text (may be empty if a file is supplied).
            pdf_file: Optional PDF upload.
            audio_file: Optional audio upload.
            history: Existing ``(user, bot)`` chat pairs; created if ``None``.

        Returns:
            ``(updated_history, "")`` — the empty string clears the input box.
        """
        if history is None:
            history = []

        # Initialize up front so the except-branch never sees it unbound.
        user_display = "Error in input"
        try:
            user_message_parts = []
            if text_input:
                user_message_parts.append(f"Text: {text_input}")
            if pdf_file:
                user_message_parts.append("📄 PDF uploaded")
            if audio_file:
                user_message_parts.append("🎤 Audio uploaded")

            # Guard: with no input at all the API would receive an empty
            # content list and return a raw error — fail fast and clearly.
            if not user_message_parts:
                history.append(("", "Please provide some input (text, PDF, or audio)."))
                return history, ""

            user_display = " | ".join(user_message_parts)

            user_message, processing_info = self.create_multimodal_message(
                text_input, pdf_file, audio_file
            )

            if processing_info:
                user_display += f"\n{' | '.join(processing_info)}"

            messages = [user_message]

            completion = self.client.chat.completions.create(
                extra_headers={
                    "HTTP-Referer": "https://multimodal-chatbot.local",
                    "X-Title": "Multimodal Chatbot",
                },
                model=self.model,
                messages=messages,
                max_tokens=2048,
                temperature=0.7
            )

            bot_response = completion.choices[0].message.content
            history.append((user_display, bot_response))
            return history, ""

        except Exception as e:
            history.append((user_display, f"Error: {str(e)}"))
            return history, ""
161
+
162
def create_interface():
    """Build and return the Gradio Blocks UI for the multimodal chatbot.

    Layout: an API-key row, then four tabs (text-only, PDF, audio, combined),
    each with its own chat history. Submit buttons start disabled and are
    enabled once an API key is entered. Returns the ``gr.Blocks`` object.
    """
    with gr.Blocks(title="Multimodal Chatbot with Gemma 3n", theme=gr.themes.Soft()) as demo:
        # Intro banner describing the supported input types.
        gr.Markdown("""
        # 🤖 Multimodal Chatbot with Gemma 3n

        This chatbot can process multiple types of input:
        - **Text**: Regular text messages
        - **PDF**: Extract and analyze document content
        - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)

        **Setup**: Enter your OpenRouter API key below to get started
        """)

        with gr.Row():
            with gr.Column():
                # Key is kept only in the live session; never persisted.
                api_key_input = gr.Textbox(
                    label="🔑 OpenRouter API Key",
                    placeholder="Enter your OpenRouter API key here...",
                    type="password",
                    info="Your API key is not stored and only used for this session"
                )
                api_status = gr.Textbox(
                    label="Connection Status",
                    value="❌ API Key not provided",
                    interactive=False
                )

        with gr.Tabs():
            # --- Tab 1: plain text chat -----------------------------------
            with gr.TabItem("💬 Text Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        text_input = gr.Textbox(
                            label="💬 Text Input",
                            placeholder="Type your message here...",
                            lines=5
                        )
                        # Disabled until the API key is validated below.
                        text_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                        text_clear_btn = gr.Button("🗑️ Clear", variant="secondary")

                    with gr.Column(scale=2):
                        text_chatbot = gr.Chatbot(
                            label="Text Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )

            # --- Tab 2: PDF question-answering ----------------------------
            with gr.TabItem("📄 PDF Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        pdf_input = gr.File(
                            label="📄 PDF Upload",
                            file_types=[".pdf"],
                            type="filepath"
                        )
                        pdf_text_input = gr.Textbox(
                            label="💬 Question about PDF",
                            placeholder="Ask something about the PDF...",
                            lines=3
                        )
                        pdf_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                        pdf_clear_btn = gr.Button("🗑️ Clear", variant="secondary")

                    with gr.Column(scale=2):
                        pdf_chatbot = gr.Chatbot(
                            label="PDF Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )

            # --- Tab 3: audio transcription + analysis --------------------
            with gr.TabItem("🎤 Audio Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        audio_input = gr.File(
                            label="🎤 Audio Upload",
                            file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
                            type="filepath"
                        )
                        audio_text_input = gr.Textbox(
                            label="💬 Question about Audio",
                            placeholder="Ask something about the audio...",
                            lines=3
                        )
                        audio_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                        audio_clear_btn = gr.Button("🗑️ Clear", variant="secondary")

                    with gr.Column(scale=2):
                        audio_chatbot = gr.Chatbot(
                            label="Audio Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )

            # --- Tab 4: all input types at once ---------------------------
            with gr.TabItem("🌟 Combined Chat"):
                with gr.Row():
                    with gr.Column(scale=1):
                        combined_text_input = gr.Textbox(
                            label="💬 Text Input",
                            placeholder="Type your message here...",
                            lines=3
                        )
                        combined_pdf_input = gr.File(
                            label="📄 PDF Upload",
                            file_types=[".pdf"],
                            type="filepath"
                        )
                        combined_audio_input = gr.File(
                            label="🎤 Audio Upload",
                            file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
                            type="filepath"
                        )
                        combined_submit_btn = gr.Button("🚀 Send All", variant="primary", size="lg", interactive=False)
                        combined_clear_btn = gr.Button("🗑️ Clear All", variant="secondary")

                    with gr.Column(scale=2):
                        combined_chatbot = gr.Chatbot(
                            label="Combined Chat History",
                            height=600,
                            bubble_full_width=False,
                            show_copy_button=True
                        )

        def validate_api_key(api_key):
            """Update the status box and toggle the four submit buttons.

            Returns a 5-tuple: status text + one gr.update per submit button.
            """
            if not api_key or len(api_key.strip()) == 0:
                return "❌ API Key not provided", *[gr.update(interactive=False) for _ in range(4)]

            try:
                # NOTE(review): constructing the client does not contact the
                # server, so this only confirms the key is non-empty — a bad
                # key will still fail later at chat time. Verify if a real
                # round-trip check is wanted here.
                test_client = OpenAI(
                    base_url="https://openrouter.ai/api/v1",
                    api_key=api_key.strip(),
                )
                return "✅ API Key validated successfully", *[gr.update(interactive=True) for _ in range(4)]
            except Exception as e:
                return f"❌ API Key validation failed: {str(e)}", *[gr.update(interactive=False) for _ in range(4)]

        def process_text_input(api_key, text, history):
            """Handle a text-tab submission; returns (history, cleared input)."""
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""

            # A fresh chatbot per request — stateless across turns.
            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, history=history)

        def process_pdf_input(api_key, pdf, text, history):
            """Handle a PDF-tab submission; returns (history, cleared input)."""
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""

            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, pdf_file=pdf, history=history)

        def process_audio_input(api_key, audio, text, history):
            """Handle an audio-tab submission; returns (history, cleared input)."""
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""

            chatbot = MultimodalChatbot(api_key.strip())
            return chatbot.chat(text_input=text, audio_file=audio, history=history)

        def process_combined_input(api_key, text, pdf, audio, history):
            """Handle a combined-tab submission with all three input kinds."""
            if not api_key or len(api_key.strip()) == 0:
                if history is None:
                    history = []
                history.append(("Error", "❌ Please provide a valid API key first"))
                return history, ""

            chatbot = MultimodalChatbot(api_key.strip())
            # Positional: chat(text_input, pdf_file, audio_file, history).
            return chatbot.chat(text, pdf, audio, history)

        def clear_chat():
            """Reset a chat history and its text box."""
            return [], ""

        def clear_all_inputs():
            """Reset the combined tab: history, text, PDF, and audio inputs."""
            return [], "", None, None

        # Re-validate on every keystroke; also gates all four submit buttons.
        api_key_input.change(
            validate_api_key,
            inputs=[api_key_input],
            outputs=[api_status, text_submit_btn, pdf_submit_btn, audio_submit_btn, combined_submit_btn]
        )

        text_submit_btn.click(
            process_text_input,
            inputs=[api_key_input, text_input, text_chatbot],
            outputs=[text_chatbot, text_input]
        )
        # Enter key in the text box behaves like the Send button.
        text_input.submit(
            process_text_input,
            inputs=[api_key_input, text_input, text_chatbot],
            outputs=[text_chatbot, text_input]
        )
        text_clear_btn.click(clear_chat, outputs=[text_chatbot, text_input])

        pdf_submit_btn.click(
            process_pdf_input,
            inputs=[api_key_input, pdf_input, pdf_text_input, pdf_chatbot],
            outputs=[pdf_chatbot, pdf_text_input]
        )
        pdf_clear_btn.click(lambda: ([], "", None), outputs=[pdf_chatbot, pdf_text_input, pdf_input])

        audio_submit_btn.click(
            process_audio_input,
            inputs=[api_key_input, audio_input, audio_text_input, audio_chatbot],
            outputs=[audio_chatbot, audio_text_input]
        )
        audio_clear_btn.click(lambda: ([], "", None), outputs=[audio_chatbot, audio_text_input, audio_input])

        combined_submit_btn.click(
            process_combined_input,
            inputs=[api_key_input, combined_text_input, combined_pdf_input,
                    combined_audio_input, combined_chatbot],
            outputs=[combined_chatbot, combined_text_input]
        )
        combined_clear_btn.click(clear_all_inputs,
                                 outputs=[combined_chatbot, combined_text_input,
                                          combined_pdf_input, combined_audio_input])

        # Footer with per-tab usage notes and API-key instructions.
        gr.Markdown("""
        ### 🎯 How to Use Each Tab:

        **💬 Text Chat**: Simple text conversations with the AI

        **📄 PDF Chat**: Upload a PDF and ask questions about its content

        **🎤 Audio Chat**: Upload audio files for transcription and analysis
        - Supports: WAV, MP3, M4A, FLAC, OGG formats
        - Best results with clear speech and minimal background noise

        **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis

        ### 🔑 Getting an API Key:
        1. Go to [OpenRouter.ai](https://openrouter.ai)
        2. Sign up for an account
        3. Navigate to the API Keys section
        4. Create a new API key
        5. Copy and paste it in the field above

        ### ⚠️ Current Limitations:
        - Audio transcription requires internet connection for best results
        - Large files may take longer to process
        """)

    return demo
414
+
415
if __name__ == "__main__":
    # Informational banner: list the third-party packages this app imports.
    deps = ["gradio", "openai", "PyPDF2", "SpeechRecognition", "pydub"]

    print("🚀 Multimodal Chatbot with Gemma 3n")
    print("=" * 50)
    print("Required packages:", ", ".join(deps))
    print("\n📦 To install: pip install " + " ".join(deps))
    print("\n🎤 For audio processing, you may also need:")
    print(" - ffmpeg (for audio conversion)")
    print(" - sudo apt-get install espeak espeak-data libespeak1 libespeak-dev (for offline speech recognition)")
    print("\n🔑 Get your API key from: https://openrouter.ai")
    print("💡 Enter your API key in the web interface when it loads")

    # Build the UI and serve it with a public share link.
    demo = create_interface()
    demo.launch(share=True)