raksama19 committed on
Commit
3ef4ad0
·
verified ·
1 Parent(s): c1ea599

Update app.py

Files changed (1)
  1. app.py +265 -4
app.py CHANGED
 
@@ -30,6 +30,19 @@ import io
 from utils.utils import *
 from utils.markdown_utils import MarkdownConverter
 
+# Voice functionality imports
+import time
+import librosa
+from dataclasses import dataclass, field
+from pydub import AudioSegment
+try:
+    from voice_chat.utils.vad import get_speech_timestamps, collect_chunks, VadOptions
+    from voice_chat.gemma3n_inference import Gemma3nInference
+    VOICE_DEPENDENCIES_AVAILABLE = True
+except ImportError as e:
+    print(f"Voice dependencies not available: {e}")
+    VOICE_DEPENDENCIES_AVAILABLE = False
+
 # Math extension is optional for enhanced math rendering
 MATH_EXTENSION_AVAILABLE = False
 try:
 
@@ -38,6 +51,19 @@ try:
 except ImportError:
     pass
 
+# Initialize voice model early to avoid NameError
+voice_model = None
+if VOICE_DEPENDENCIES_AVAILABLE:
+    try:
+        print("Loading voice model...")
+        voice_model = Gemma3nInference(device='cuda' if torch.cuda.is_available() else 'cpu')
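+        # Gemma3nInference (local wrapper from voice_chat/) serves both speech
+        # understanding (generate_response) and TTS (text_to_speech_simple) below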
+        print("Warming up voice model...")
+        voice_model.warm_up()
+        print("✅ Voice model loaded and warmed up successfully")
+    except Exception as e:
+        print(f"⚠️ Voice model initialization failed: {e}")
+        voice_model = None
+
 
 class DOLPHIN:
     def __init__(self, model_id_or_path):
 
@@ -487,6 +513,149 @@ show_results_tab = False
 document_chunks = []
 document_embeddings = None
 
+# Voice chat parameters and state
+IN_CHANNELS = 1
+IN_RATE = 24000
+IN_CHUNK = 1024
+IN_SAMPLE_WIDTH = 2
+VAD_STRIDE = 0.5
+OUT_CHANNELS = 1
+OUT_RATE = 24000
+OUT_SAMPLE_WIDTH = 2
+OUT_CHUNK = 20 * 4096
+
+# Voice model already initialized earlier in the file
+
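+# Per-session state for the streaming voice pipeline: accumulated audio,
+# VAD flags, and the running conversation history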
+@dataclass
+class VoiceAppState:
+    stream: np.ndarray | None = None
+    sampling_rate: int = 0
+    pause_detected: bool = False
+    started_talking: bool = False
+    stopped: bool = False
+    conversation: list = field(default_factory=list)
+
+
+# Voice functionality
+def run_vad(ori_audio, sr):
+    """Voice Activity Detection"""
+    _st = time.time()
+    try:
+        audio = ori_audio
+        if isinstance(audio, bytes):
+            audio = np.frombuffer(audio, dtype=np.int16)
+        audio = audio.astype(np.float32) / 32768.0
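+        # The VAD helpers (assumed Silero-style) expect 16 kHz mono float32 in [-1, 1]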
+        sampling_rate = 16000
+        if sr != sampling_rate:
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
+
+        vad_parameters = {}
+        vad_parameters = VadOptions(**vad_parameters)
+        speech_chunks = get_speech_timestamps(audio, vad_parameters)
+        audio = collect_chunks(audio, speech_chunks)
+        duration_after_vad = audio.shape[0] / sampling_rate
+
+        if sr != sampling_rate:
+            vad_audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)
+        else:
+            vad_audio = audio
+        vad_audio = np.round(vad_audio * 32768.0).astype(np.int16)
+        vad_audio_bytes = vad_audio.tobytes()
+
+        return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
+    except Exception as e:
+        msg = f"[asr vad error] audio_len: {len(ori_audio)/(sr*2):.3f} s, trace: {e}"
+        print(msg)
+        return -1, ori_audio, round(time.time() - _st, 4)
+
+def determine_pause(audio: np.ndarray, sampling_rate: int, state: VoiceAppState) -> bool:
+    """Determine if a pause happened in the audio stream"""
+    temp_audio = audio
+    dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
+    duration = len(audio) / sampling_rate
+
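+    # Heuristic turn-taking: >0.5 s of detected speech marks the user as talking;
+    # after that, more than 1 s of trailing non-speech is treated as a pause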
+    if dur_vad > 0.5 and not state.started_talking:
+        print("started talking")
+        state.started_talking = True
+        return False
+
+    print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
+    return (duration - dur_vad) > 1
+
+def process_voice_audio(audio: tuple, state: VoiceAppState):
+    """Process streaming audio input"""
+    if not VOICE_DEPENDENCIES_AVAILABLE or voice_model is None:
+        return None, state
+
+    if state.stream is None:
+        state.stream = audio[1]
+        state.sampling_rate = audio[0]
+    else:
+        state.stream = np.concatenate((state.stream, audio[1]))
+
+    pause_detected = determine_pause(state.stream, state.sampling_rate, state)
+    state.pause_detected = pause_detected
+
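+    # Updating the component with recording=False stops the mic, which in turn
+    # fires the stop_recording event wired up below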
+    if state.pause_detected and state.started_talking:
+        return gr.Audio(recording=False), state
+    return None, state
+
+def generate_voice_response(state: VoiceAppState):
+    """Generate voice response from audio input"""
+    if not VOICE_DEPENDENCIES_AVAILABLE or voice_model is None:
+        return None, VoiceAppState()
+
+    if not state.pause_detected and not state.started_talking:
+        return None, VoiceAppState()
+
+    try:
+        audio_buffer = io.BytesIO()
+        segment = AudioSegment(
+            state.stream.tobytes(),
+            frame_rate=state.sampling_rate,
+            sample_width=state.stream.dtype.itemsize,
+            channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
+        )
+        segment.export(audio_buffer, format="wav")
+
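+        # Persist the WAV to a temp file (delete=False) since the model consumes
+        # a file path; the file is removed in the finally block below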
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+            f.write(audio_buffer.getvalue())
+            temp_audio_path = f.name
+
+        try:
+            # Generate text response from audio
+            text_response = voice_model.generate_response(temp_audio_path)
+            print(f"Generated voice response: {text_response}")
+
+            # Convert text to speech
+            audio_response = voice_model.text_to_speech_simple(text_response)
+
+            # Convert to format expected by Gradio
+            audio_segment = AudioSegment.from_file(io.BytesIO(audio_response), format="wav")
+            audio_array = np.array(audio_segment.get_array_of_samples())
+
+            if audio_segment.channels == 2:
+                audio_array = audio_array.reshape((-1, 2))
+
+            # Update conversation history
+            state.conversation.append({"role": "user", "content": {"path": temp_audio_path, "mime_type": "audio/wav"}})
+            state.conversation.append({"role": "assistant", "content": {"text": text_response}})
+
+            return (audio_segment.frame_rate, audio_array), VoiceAppState(conversation=state.conversation)
+
+        finally:
+            if os.path.exists(temp_audio_path):
+                os.unlink(temp_audio_path)
+
+    except Exception as e:
+        print(f"Error generating voice response: {e}")
+        return None, VoiceAppState()
+
+def start_voice_recording(state: VoiceAppState):
+    """Start recording user voice input"""
+    if not state.stopped:
+        return gr.Audio(recording=True)
+    return gr.Audio(recording=False)
 
 def chunk_document(text, chunk_size=1024, overlap=100):
     """Split document into overlapping chunks for RAG"""
 
@@ -645,15 +814,22 @@ with gr.Blocks(
         # Home Tab
         with gr.TabItem("🏠 Home", id="home"):
             embedding_status = "✅ RAG ready" if embedding_model else "❌ RAG not loaded"
+            voice_status = "✅ Voice chat ready" if VOICE_DEPENDENCIES_AVAILABLE and voice_model else "❌ Voice chat not available"
             gr.Markdown(
-                "# Scholar Express - Local Gemma 3n Version\n"
-                "### Upload a research paper to get a web-friendly version with AI-generated alt text for accessibility. Includes an AI chatbot powered by local Gemma 3n.\n"
+                "# Scholar Express - Local Gemma 3n Version with Voice\n"
+                "### Upload a research paper to get a web-friendly version with AI-generated alt text for accessibility. Includes an AI chatbot and voice chat powered by local Gemma 3n.\n"
                 f"**System:** {model_status}\n"
                 f"**RAG System:** {embedding_status}\n"
+                f"**Voice Chat:** {voice_status}\n"
                 f"**DOLPHIN:** Local model for PDF processing\n"
-                f"**Gemma 3n:** Local model for alt text generation and chat\n"
+                f"**Gemma 3n:** Local model for alt text generation, chat, and voice\n"
                 f"**Alt Text:** Gemma 3n generates descriptive alt text for images\n"
-                f"**GPU:** {'CUDA available' if torch.cuda.is_available() else 'CPU only'}"
+                f"**GPU:** {'CUDA available' if torch.cuda.is_available() else 'CPU only'}\n\n"
+                "**Features:**\n"
+                "- 📄 PDF processing with OCR and layout analysis\n"
+                "- 💬 Text-based chat about your documents\n"
+                "- 🎙️ Voice chat with Gemma 3n (new!)\n"
+                "- ♿ AI-generated alt text for accessibility"
             )
 
             with gr.Column(elem_classes="upload-container"):
 
@@ -727,6 +903,51 @@
                 "*Ask questions about your processed document. The AI uses RAG (Retrieval-Augmented Generation) with local Gemma 3n to find relevant sections and provide accurate answers.*",
                 elem_id="chat-notice"
             )
+
+        # Voice Chat Tab
+        with gr.TabItem("🎙️ Talk with Gemma", id="voice") as voice_tab:
+            voice_status = "✅ Voice chat ready" if VOICE_DEPENDENCIES_AVAILABLE and voice_model else "❌ Voice chat not available"
+            gr.Markdown(f"## Voice Chat with Gemma 3n\n{voice_status}")
+
+            if VOICE_DEPENDENCIES_AVAILABLE and voice_model:
+                with gr.Row():
+                    with gr.Column():
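+                        # Mic audio streams as numpy chunks into process_voice_audio (wired up below)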
+                        voice_input_audio = gr.Audio(
+                            label="Speak to Gemma",
+                            sources=["microphone"],
+                            type="numpy",
+                            streaming=True
+                        )
+                    with gr.Column():
+                        voice_output_audio = gr.Audio(
+                            label="Gemma's Response",
+                            streaming=True,
+                            autoplay=True
+                        )
+                voice_chatbot = gr.Chatbot(
+                    label="Voice Conversation",
+                    type="messages",
+                    height=300
+                )
+
+                with gr.Row():
+                    voice_stop_btn = gr.Button("⏹️ Stop Conversation", variant="stop")
+
+                gr.Markdown(
+                    "*Speak naturally to Gemma 3n. The AI will listen to your voice, process your speech, and respond with both text and voice. You can have conversations before or after processing PDFs.*"
+                )
+
+                # Voice state
+                voice_state = gr.State(value=VoiceAppState())
+            else:
+                gr.Markdown(
+                    "### Voice chat is not available\n"
+                    "To enable voice chat, please install the required dependencies:\n"
+                    "```bash\n"
+                    "pip install librosa pydub onnxruntime\n"
+                    "```\n"
+                    "And ensure the voice_chat directory is properly set up."
+                )
 
     # Event handlers
     process_btn.click(
 
@@ -814,6 +1035,46 @@ Please provide a clear and helpful answer based on the context provided."""
         lambda: "",
         outputs=[msg_input]
     )
+
+    # Voice chat event handlers
+    if VOICE_DEPENDENCIES_AVAILABLE and voice_model:
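+        # Turn loop: stream() buffers mic audio until a pause is detected,
+        # stop_recording() generates the reply, .then() refreshes the transcript,
+        # and the output player's stop() re-arms the microphone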
+        # Stream processing
+        voice_stream = voice_input_audio.stream(
+            process_voice_audio,
+            [voice_input_audio, voice_state],
+            [voice_input_audio, voice_state],
+            stream_every=0.50,
+            time_limit=30,
+        )
+
+        # Response generation
+        voice_respond = voice_input_audio.stop_recording(
+            generate_voice_response,
+            [voice_state],
+            [voice_output_audio, voice_state]
+        )
+
+        # Update chatbot display
+        voice_respond.then(
+            lambda s: s.conversation,
+            [voice_state],
+            [voice_chatbot]
+        )
+
+        # Restart recording
+        voice_restart = voice_output_audio.stop(
+            start_voice_recording,
+            [voice_state],
+            [voice_input_audio]
+        )
+
+        # Stop conversation
+        voice_stop_btn.click(
+            lambda: (VoiceAppState(stopped=True), gr.Audio(recording=False)),
+            None,
+            [voice_state, voice_input_audio],
+            cancels=[voice_respond, voice_restart]
+        )
 
 
 if __name__ == "__main__":