raksama19 committed on
Commit c1ea599 · verified · Parent: c3ca98f

Update app.py

Files changed (1)
app.py +4 -270
app.py CHANGED
@@ -30,19 +30,6 @@ import io
 from utils.utils import *
 from utils.markdown_utils import MarkdownConverter
 
-# Voice functionality imports
-import time
-import librosa
-from dataclasses import dataclass, field
-from pydub import AudioSegment
-try:
-    from voice_chat.utils.vad import get_speech_timestamps, collect_chunks, VadOptions
-    from voice_chat.gemma3n_inference import Gemma3nInference
-    VOICE_DEPENDENCIES_AVAILABLE = True
-except ImportError as e:
-    print(f"Voice dependencies not available: {e}")
-    VOICE_DEPENDENCIES_AVAILABLE = False
-
 # Math extension is optional for enhanced math rendering
 MATH_EXTENSION_AVAILABLE = False
 try:
@@ -51,15 +38,6 @@ try:
 except ImportError:
     pass
 
-# Warm up voice model if available
-if VOICE_DEPENDENCIES_AVAILABLE and voice_model:
-    try:
-        print("Warming up voice model...")
-        voice_model.warm_up()
-        print("✅ Voice model warmed up successfully")
-    except Exception as e:
-        print(f"⚠️ Voice model warm-up failed: {e}")
-
 
 class DOLPHIN:
     def __init__(self, model_id_or_path):
@@ -509,158 +487,6 @@ show_results_tab = False
 document_chunks = []
 document_embeddings = None
 
-# Voice chat parameters and state
-IN_CHANNELS = 1
-IN_RATE = 24000
-IN_CHUNK = 1024
-IN_SAMPLE_WIDTH = 2
-VAD_STRIDE = 0.5
-OUT_CHANNELS = 1
-OUT_RATE = 24000
-OUT_SAMPLE_WIDTH = 2
-OUT_CHUNK = 20 * 4096
-
-# Initialize voice inference model if available
-voice_model = None
-if VOICE_DEPENDENCIES_AVAILABLE:
-    try:
-        print("Loading voice model for Talk with Gemma...")
-        voice_model = Gemma3nInference(device='cuda' if torch.cuda.is_available() else 'cpu')
-        print("✅ Voice model loaded successfully")
-    except Exception as e:
-        print(f"❌ Error loading voice model: {e}")
-        VOICE_DEPENDENCIES_AVAILABLE = False
-
-@dataclass
-class VoiceAppState:
-    stream: np.ndarray | None = None
-    sampling_rate: int = 0
-    pause_detected: bool = False
-    started_talking: bool = False
-    stopped: bool = False
-    conversation: list = field(default_factory=list)
-
-
-# Voice functionality
-def run_vad(ori_audio, sr):
-    """Voice Activity Detection"""
-    _st = time.time()
-    try:
-        audio = ori_audio
-        if isinstance(audio, bytes):
-            audio = np.frombuffer(audio, dtype=np.int16)
-        audio = audio.astype(np.float32) / 32768.0
-        sampling_rate = 16000
-        if sr != sampling_rate:
-            audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
-
-        vad_parameters = {}
-        vad_parameters = VadOptions(**vad_parameters)
-        speech_chunks = get_speech_timestamps(audio, vad_parameters)
-        audio = collect_chunks(audio, speech_chunks)
-        duration_after_vad = audio.shape[0] / sampling_rate
-
-        if sr != sampling_rate:
-            vad_audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)
-        else:
-            vad_audio = audio
-        vad_audio = np.round(vad_audio * 32768.0).astype(np.int16)
-        vad_audio_bytes = vad_audio.tobytes()
-
-        return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
-    except Exception as e:
-        msg = f"[asr vad error] audio_len: {len(ori_audio)/(sr*2):.3f} s, trace: {e}"
-        print(msg)
-        return -1, ori_audio, round(time.time() - _st, 4)
-
-def determine_pause(audio: np.ndarray, sampling_rate: int, state: VoiceAppState) -> bool:
-    """Determine if a pause happened in the audio stream"""
-    temp_audio = audio
-    dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
-    duration = len(audio) / sampling_rate
-
-    if dur_vad > 0.5 and not state.started_talking:
-        print("started talking")
-        state.started_talking = True
-        return False
-
-    print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
-    return (duration - dur_vad) > 1
-
-def process_voice_audio(audio: tuple, state: VoiceAppState):
-    """Process streaming audio input"""
-    if not VOICE_DEPENDENCIES_AVAILABLE or voice_model is None:
-        return None, state
-
-    if state.stream is None:
-        state.stream = audio[1]
-        state.sampling_rate = audio[0]
-    else:
-        state.stream = np.concatenate((state.stream, audio[1]))
-
-    pause_detected = determine_pause(state.stream, state.sampling_rate, state)
-    state.pause_detected = pause_detected
-
-    if state.pause_detected and state.started_talking:
-        return gr.Audio(recording=False), state
-    return None, state
-
-def generate_voice_response(state: VoiceAppState):
-    """Generate voice response from audio input"""
-    if not VOICE_DEPENDENCIES_AVAILABLE or voice_model is None:
-        return None, VoiceAppState()
-
-    if not state.pause_detected and not state.started_talking:
-        return None, VoiceAppState()
-
-    try:
-        audio_buffer = io.BytesIO()
-        segment = AudioSegment(
-            state.stream.tobytes(),
-            frame_rate=state.sampling_rate,
-            sample_width=state.stream.dtype.itemsize,
-            channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
-        )
-        segment.export(audio_buffer, format="wav")
-
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            f.write(audio_buffer.getvalue())
-            temp_audio_path = f.name
-
-        try:
-            # Generate text response from audio
-            text_response = voice_model.generate_response(temp_audio_path)
-            print(f"Generated voice response: {text_response}")
-
-            # Convert text to speech
-            audio_response = voice_model.text_to_speech_simple(text_response)
-
-            # Convert to format expected by Gradio
-            audio_segment = AudioSegment.from_file(io.BytesIO(audio_response), format="wav")
-            audio_array = np.array(audio_segment.get_array_of_samples())
-
-            if audio_segment.channels == 2:
-                audio_array = audio_array.reshape((-1, 2))
-
-            # Update conversation history
-            state.conversation.append({"role": "user", "content": {"path": temp_audio_path, "mime_type": "audio/wav"}})
-            state.conversation.append({"role": "assistant", "content": {"text": text_response}})
-
-            return (audio_segment.frame_rate, audio_array), VoiceAppState(conversation=state.conversation)
-
-        finally:
-            if os.path.exists(temp_audio_path):
-                os.unlink(temp_audio_path)
-
-    except Exception as e:
-        print(f"Error generating voice response: {e}")
-        return None, VoiceAppState()
-
-def start_voice_recording(state: VoiceAppState):
-    """Start recording user voice input"""
-    if not state.stopped:
-        return gr.Audio(recording=True)
-    return gr.Audio(recording=False)
 
 def chunk_document(text, chunk_size=1024, overlap=100):
     """Split document into overlapping chunks for RAG"""
@@ -819,22 +645,15 @@ with gr.Blocks(
         # Home Tab
         with gr.TabItem("🏠 Home", id="home"):
             embedding_status = "✅ RAG ready" if embedding_model else "❌ RAG not loaded"
-            voice_status = "✅ Voice chat ready" if VOICE_DEPENDENCIES_AVAILABLE and voice_model else "❌ Voice chat not available"
             gr.Markdown(
-                "# Scholar Express - Local Gemma 3n Version with Voice\n"
-                "### Upload a research paper to get a web-friendly version with AI-generated alt text for accessibility. Includes an AI chatbot and voice chat powered by local Gemma 3n.\n"
+                "# Scholar Express - Local Gemma 3n Version\n"
+                "### Upload a research paper to get a web-friendly version with AI-generated alt text for accessibility. Includes an AI chatbot powered by local Gemma 3n.\n"
                 f"**System:** {model_status}\n"
                 f"**RAG System:** {embedding_status}\n"
-                f"**Voice Chat:** {voice_status}\n"
                 f"**DOLPHIN:** Local model for PDF processing\n"
-                f"**Gemma 3n:** Local model for alt text generation, chat, and voice\n"
+                f"**Gemma 3n:** Local model for alt text generation and chat\n"
                 f"**Alt Text:** Gemma 3n generates descriptive alt text for images\n"
-                f"**GPU:** {'CUDA available' if torch.cuda.is_available() else 'CPU only'}\n\n"
-                "**Features:**\n"
-                "- 📄 PDF processing with OCR and layout analysis\n"
-                "- 💬 Text-based chat about your documents\n"
-                "- 🎙️ Voice chat with Gemma 3n (new!)\n"
-                "- ♿ AI-generated alt text for accessibility"
+                f"**GPU:** {'CUDA available' if torch.cuda.is_available() else 'CPU only'}"
             )
 
             with gr.Column(elem_classes="upload-container"):
@@ -908,51 +727,6 @@
                 "*Ask questions about your processed document. The AI uses RAG (Retrieval-Augmented Generation) with local Gemma 3n to find relevant sections and provide accurate answers.*",
                 elem_id="chat-notice"
             )
-
-        # Voice Chat Tab
-        with gr.TabItem("🎙️ Talk with Gemma", id="voice") as voice_tab:
-            voice_status = "✅ Voice chat ready" if VOICE_DEPENDENCIES_AVAILABLE and voice_model else "❌ Voice chat not available"
-            gr.Markdown(f"## Voice Chat with Gemma 3n\n{voice_status}")
-
-            if VOICE_DEPENDENCIES_AVAILABLE and voice_model:
-                with gr.Row():
-                    with gr.Column():
-                        voice_input_audio = gr.Audio(
-                            label="Speak to Gemma",
-                            sources=["microphone"],
-                            type="numpy",
-                            streaming=True
-                        )
-                    with gr.Column():
-                        voice_output_audio = gr.Audio(
-                            label="Gemma's Response",
-                            streaming=True,
-                            autoplay=True
-                        )
-                voice_chatbot = gr.Chatbot(
-                    label="Voice Conversation",
-                    type="messages",
-                    height=300
-                )
-
-                with gr.Row():
-                    voice_stop_btn = gr.Button("⏹️ Stop Conversation", variant="stop")
-
-                gr.Markdown(
-                    "*Speak naturally to Gemma 3n. The AI will listen to your voice, process your speech, and respond with both text and voice. You can have conversations before or after processing PDFs.*"
-                )
-
-                # Voice state
-                voice_state = gr.State(value=VoiceAppState())
-            else:
-                gr.Markdown(
-                    "### Voice chat is not available\n"
-                    "To enable voice chat, please install the required dependencies:\n"
-                    "```bash\n"
-                    "pip install librosa pydub onnxruntime\n"
-                    "```\n"
-                    "And ensure the voice_chat directory is properly set up."
-                )
 
     # Event handlers
     process_btn.click(
@@ -1040,46 +814,6 @@ Please provide a clear and helpful answer based on the context provided."""
         lambda: "",
         outputs=[msg_input]
     )
-
-    # Voice chat event handlers
-    if VOICE_DEPENDENCIES_AVAILABLE and voice_model:
-        # Stream processing
-        voice_stream = voice_input_audio.stream(
-            process_voice_audio,
-            [voice_input_audio, voice_state],
-            [voice_input_audio, voice_state],
-            stream_every=0.50,
-            time_limit=30,
-        )
-
-        # Response generation
-        voice_respond = voice_input_audio.stop_recording(
-            generate_voice_response,
-            [voice_state],
-            [voice_output_audio, voice_state]
-        )
-
-        # Update chatbot display
-        voice_respond.then(
-            lambda s: s.conversation,
-            [voice_state],
-            [voice_chatbot]
-        )
-
-        # Restart recording
-        voice_restart = voice_output_audio.stop(
-            start_voice_recording,
-            [voice_state],
-            [voice_input_audio]
-        )
-
-        # Stop conversation
-        voice_stop_btn.click(
-            lambda: (VoiceAppState(stopped=True), gr.Audio(recording=False)),
-            None,
-            [voice_state, voice_input_audio],
-            cancels=[voice_respond, voice_restart]
-        )
 
 
 if __name__ == "__main__":
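
The handlers removed in this last hunk formed a loop: stream microphone chunks into state, stop recording when a pause is detected, synthesize a reply, and re-arm the microphone when playback of the reply ends. A minimal sketch of that event chain using Gradio's `stream`, `stop_recording`, and `stop` Audio events, where `accumulate` and `echo_reply` are hypothetical placeholders standing in for the VAD check and the Gemma 3n voice model (assumptions for illustration, not the project's code):

```python
import gradio as gr
import numpy as np

def accumulate(audio: tuple, state):
    """Append each streamed chunk; stop recording once 'enough' audio
    arrives (a fixed 5 s cap stands in for the VAD pause check)."""
    rate, chunk = audio
    state = (rate, chunk) if state is None else (rate, np.concatenate((state[1], chunk)))
    if len(state[1]) / rate > 5:
        return gr.Audio(recording=False), state
    return None, state

def echo_reply(state):
    """Placeholder responder: play the captured audio back, then clear state."""
    if state is None:
        return None, None
    rate, buf = state
    return (rate, buf), None

with gr.Blocks() as demo:
    mic = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
    out = gr.Audio(autoplay=True)
    state = gr.State(None)

    # Same chain the app wired up: stream -> stop_recording -> playback stop.
    mic.stream(accumulate, [mic, state], [mic, state], stream_every=0.5)
    mic.stop_recording(echo_reply, [state], [out, state])
    out.stop(lambda: gr.Audio(recording=True), None, [mic])

demo.launch()
```

Re-arming the recorder from the output's `stop` event is what made the removed UI feel like a continuous conversation rather than a push-to-talk interface.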