shukdevdattaEX committed on
Commit
a660b93
·
verified ·
1 Parent(s): dcec17f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -14
app.py CHANGED
@@ -80,10 +80,39 @@ class MultimodalChatbot:
80
  except Exception as e:
81
  return f"Error transcribing audio: {str(e)}"
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  def create_multimodal_message(self,
84
  text_input: str = "",
85
  pdf_file=None,
86
- audio_file=None) -> dict:
 
87
  """Create a multimodal message for the API"""
88
  content_parts = []
89
  processing_info = []
@@ -107,12 +136,21 @@ class MultimodalChatbot:
107
  })
108
  processing_info.append("🎀 Audio transcribed")
109
 
 
 
 
 
 
 
 
 
110
  return {"role": "user", "content": content_parts}, processing_info
111
 
112
  def chat(self,
113
  text_input: str = "",
114
  pdf_file=None,
115
  audio_file=None,
 
116
  history: List[Tuple[str, str]] = None) -> Tuple[List[Tuple[str, str]], str]:
117
  """Main chat function"""
118
  if history is None:
@@ -126,11 +164,13 @@ class MultimodalChatbot:
126
  user_message_parts.append("πŸ“„ PDF uploaded")
127
  if audio_file:
128
  user_message_parts.append("🎀 Audio uploaded")
 
 
129
 
130
  user_display = " | ".join(user_message_parts)
131
 
132
  user_message, processing_info = self.create_multimodal_message(
133
- text_input, pdf_file, audio_file
134
  )
135
 
136
  if processing_info:
@@ -168,7 +208,7 @@ def create_interface():
168
  This chatbot can process multiple types of input:
169
  - **Text**: Regular text messages
170
  - **PDF**: Extract and analyze document content
171
- - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)
172
 
173
  **Setup**: Enter your OpenRouter API key below to get started
174
  """)
@@ -239,6 +279,11 @@ def create_interface():
239
  file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
240
  type="filepath"
241
  )
 
 
 
 
 
242
  audio_text_input = gr.Textbox(
243
  label="πŸ’¬ Question about Audio",
244
  placeholder="Ask something about the audio...",
@@ -273,6 +318,11 @@ def create_interface():
273
  file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
274
  type="filepath"
275
  )
 
 
 
 
 
276
  combined_submit_btn = gr.Button("πŸš€ Send All", variant="primary", size="lg", interactive=False)
277
  combined_clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
278
 
@@ -317,7 +367,7 @@ def create_interface():
317
  chatbot = MultimodalChatbot(api_key.strip())
318
  return chatbot.chat(text_input=text, pdf_file=pdf, history=history)
319
 
320
- def process_audio_input(api_key, audio, text, history):
321
  if not api_key or len(api_key.strip()) == 0:
322
  if history is None:
323
  history = []
@@ -325,9 +375,9 @@ def create_interface():
325
  return history, ""
326
 
327
  chatbot = MultimodalChatbot(api_key.strip())
328
- return chatbot.chat(text_input=text, audio_file=audio, history=history)
329
 
330
- def process_combined_input(api_key, text, pdf, audio, history):
331
  if not api_key or len(api_key.strip()) == 0:
332
  if history is None:
333
  history = []
@@ -335,14 +385,17 @@ def create_interface():
335
  return history, ""
336
 
337
  chatbot = MultimodalChatbot(api_key.strip())
338
- return chatbot.chat(text, pdf, audio, history)
339
 
340
  def clear_chat():
341
  return [], ""
342
 
343
- def clear_all_inputs():
344
  return [], "", None, None
345
 
 
 
 
346
  api_key_input.change(
347
  validate_api_key,
348
  inputs=[api_key_input],
@@ -370,20 +423,21 @@ def create_interface():
370
 
371
  audio_submit_btn.click(
372
  process_audio_input,
373
- inputs=[api_key_input, audio_input, audio_text_input, audio_chatbot],
374
  outputs=[audio_chatbot, audio_text_input]
375
  )
376
- audio_clear_btn.click(lambda: ([], "", None), outputs=[audio_chatbot, audio_text_input, audio_input])
377
 
378
  combined_submit_btn.click(
379
  process_combined_input,
380
  inputs=[api_key_input, combined_text_input, combined_pdf_input,
381
- combined_audio_input, combined_chatbot],
382
  outputs=[combined_chatbot, combined_text_input]
383
  )
384
  combined_clear_btn.click(clear_all_inputs,
385
  outputs=[combined_chatbot, combined_text_input,
386
- combined_pdf_input, combined_audio_input])
 
387
 
388
  gr.Markdown("""
389
  ### 🎯 How to Use Each Tab:
@@ -392,8 +446,9 @@ def create_interface():
392
 
393
  **πŸ“„ PDF Chat**: Upload a PDF and ask questions about its content
394
 
395
- **🎀 Audio Chat**: Upload audio files for transcription and analysis
396
- - Supports: WAV, MP3, M4A, FLAC, OGG formats
 
397
  - Best results with clear speech and minimal background noise
398
 
399
  **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis
@@ -408,6 +463,7 @@ def create_interface():
408
  ### ⚠️ Current Limitations:
409
  - Audio transcription requires internet connection for best results
410
  - Large files may take longer to process
 
411
  """)
412
 
413
  return demo
 
80
  except Exception as e:
81
  return f"Error transcribing audio: {str(e)}"
82
 
83
+ def transcribe_recorded_audio(self, audio_data) -> str:
84
+ """Transcribe recorded audio to text"""
85
+ try:
86
+ recognizer = sr.Recognizer()
87
+ wav_path = tempfile.mktemp(suffix='.wav')
88
+
89
+ # Convert raw audio data to WAV
90
+ audio = AudioSegment.from_file(io.BytesIO(audio_data), format="wav")
91
+ audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
92
+
93
+ with sr.AudioFile(wav_path) as source:
94
+ recognizer.adjust_for_ambient_noise(source, duration=0.2)
95
+ audio_data = recognizer.record(source)
96
+
97
+ try:
98
+ text = recognizer.recognize_google(audio_data)
99
+ return text
100
+ except sr.UnknownValueError:
101
+ return "Could not understand the recorded audio. Please try with clearer audio."
102
+ except sr.RequestError as e:
103
+ try:
104
+ text = recognizer.recognize_sphinx(audio_data)
105
+ return text
106
+ except:
107
+ return f"Speech recognition service error: {str(e)}"
108
+ except Exception as e:
109
+ return f"Error transcribing recorded audio: {str(e)}"
110
+
111
  def create_multimodal_message(self,
112
  text_input: str = "",
113
  pdf_file=None,
114
+ audio_file=None,
115
+ recorded_audio=None) -> dict:
116
  """Create a multimodal message for the API"""
117
  content_parts = []
118
  processing_info = []
 
136
  })
137
  processing_info.append("🎀 Audio transcribed")
138
 
139
+ if recorded_audio is not None:
140
+ audio_text = self.transcribe_recorded_audio(recorded_audio)
141
+ content_parts.append({
142
+ "type": "text",
143
+ "text": f"Recorded Audio Transcription:\n{audio_text}"
144
+ })
145
+ processing_info.append("πŸŽ™οΈ Recorded audio transcribed")
146
+
147
  return {"role": "user", "content": content_parts}, processing_info
148
 
149
  def chat(self,
150
  text_input: str = "",
151
  pdf_file=None,
152
  audio_file=None,
153
+ recorded_audio=None,
154
  history: List[Tuple[str, str]] = None) -> Tuple[List[Tuple[str, str]], str]:
155
  """Main chat function"""
156
  if history is None:
 
164
  user_message_parts.append("πŸ“„ PDF uploaded")
165
  if audio_file:
166
  user_message_parts.append("🎀 Audio uploaded")
167
+ if recorded_audio:
168
+ user_message_parts.append("πŸŽ™οΈ Recorded audio")
169
 
170
  user_display = " | ".join(user_message_parts)
171
 
172
  user_message, processing_info = self.create_multimodal_message(
173
+ text_input, pdf_file, audio_file, recorded_audio
174
  )
175
 
176
  if processing_info:
 
208
  This chatbot can process multiple types of input:
209
  - **Text**: Regular text messages
210
  - **PDF**: Extract and analyze document content
211
+ - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC, recorded audio)
212
 
213
  **Setup**: Enter your OpenRouter API key below to get started
214
  """)
 
279
  file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
280
  type="filepath"
281
  )
282
+ audio_recorder = gr.Audio(
283
+ label="πŸŽ™οΈ Record Audio",
284
+ source="microphone",
285
+ type="numpy"
286
+ )
287
  audio_text_input = gr.Textbox(
288
  label="πŸ’¬ Question about Audio",
289
  placeholder="Ask something about the audio...",
 
318
  file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
319
  type="filepath"
320
  )
321
+ combined_audio_recorder = gr.Audio(
322
+ label="πŸŽ™οΈ Record Audio",
323
+ source="microphone",
324
+ type="numpy"
325
+ )
326
  combined_submit_btn = gr.Button("πŸš€ Send All", variant="primary", size="lg", interactive=False)
327
  combined_clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
328
 
 
367
  chatbot = MultimodalChatbot(api_key.strip())
368
  return chatbot.chat(text_input=text, pdf_file=pdf, history=history)
369
 
370
+ def process_audio_input(api_key, audio, recorded_audio, text, history):
371
  if not api_key or len(api_key.strip()) == 0:
372
  if history is None:
373
  history = []
 
375
  return history, ""
376
 
377
  chatbot = MultimodalChatbot(api_key.strip())
378
+ return chatbot.chat(text_input=text, audio_file=audio, recorded_audio=recorded_audio, history=history)
379
 
380
+ def process_combined_input(api_key, text, pdf, audio, recorded_audio, history):
381
  if not api_key or len(api_key.strip()) == 0:
382
  if history is None:
383
  history = []
 
385
  return history, ""
386
 
387
  chatbot = MultimodalChatbot(api_key.strip())
388
+ return chatbot.chat(text, pdf, audio, recorded_audio, history)
389
 
390
def clear_chat():
    """Reset the conversation: empty history plus an empty input box."""
    fresh_history = []
    return fresh_history, ""
392
 
393
def clear_audio_inputs():
    """Clear the audio tab: chat history, question box, upload, recorder."""
    cleared = ([], "", None, None)
    return cleared
395
 
396
def clear_all_inputs():
    """Reset every combined-tab component: history, text, PDF, upload, recorder."""
    no_history = []
    no_text = ""
    return no_history, no_text, None, None, None
398
+
399
  api_key_input.change(
400
  validate_api_key,
401
  inputs=[api_key_input],
 
423
 
424
  audio_submit_btn.click(
425
  process_audio_input,
426
+ inputs=[api_key_input, audio_input, audio_recorder, audio_text_input, audio_chatbot],
427
  outputs=[audio_chatbot, audio_text_input]
428
  )
429
+ audio_clear_btn.click(clear_audio_inputs, outputs=[audio_chatbot, audio_text_input, audio_input, audio_recorder])
430
 
431
  combined_submit_btn.click(
432
  process_combined_input,
433
  inputs=[api_key_input, combined_text_input, combined_pdf_input,
434
+ combined_audio_input, combined_audio_recorder, combined_chatbot],
435
  outputs=[combined_chatbot, combined_text_input]
436
  )
437
  combined_clear_btn.click(clear_all_inputs,
438
  outputs=[combined_chatbot, combined_text_input,
439
+ combined_pdf_input, combined_audio_input,
440
+ combined_audio_recorder])
441
 
442
  gr.Markdown("""
443
  ### 🎯 How to Use Each Tab:
 
446
 
447
  **πŸ“„ PDF Chat**: Upload a PDF and ask questions about its content
448
 
449
+ **🎀 Audio Chat**: Upload or record audio files for transcription and analysis
450
+ - Supports: WAV, MP3, M4A, FLAC, OGG formats for uploads
451
+ - Recorded audio is processed directly from your microphone
452
  - Best results with clear speech and minimal background noise
453
 
454
  **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis
 
463
  ### ⚠️ Current Limitations:
464
  - Audio transcription requires internet connection for best results
465
  - Large files may take longer to process
466
+ - Recorded audio quality depends on your microphone
467
  """)
468
 
469
  return demo