Update app.py

app.py (CHANGED)
```diff
@@ -30,19 +30,6 @@ import io
 from utils.utils import *
 from utils.markdown_utils import MarkdownConverter
 
-# Voice functionality imports
-import time
-import librosa
-from dataclasses import dataclass, field
-from pydub import AudioSegment
-try:
-    from voice_chat.utils.vad import get_speech_timestamps, collect_chunks, VadOptions
-    from voice_chat.gemma3n_inference import Gemma3nInference
-    VOICE_DEPENDENCIES_AVAILABLE = True
-except ImportError as e:
-    print(f"Voice dependencies not available: {e}")
-    VOICE_DEPENDENCIES_AVAILABLE = False
-
 # Math extension is optional for enhanced math rendering
 MATH_EXTENSION_AVAILABLE = False
 try:
```
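The block removed above used the common optional-dependency pattern: attempt the imports once at module load and record the outcome in a flag that feature code checks later. A minimal standalone sketch of that pattern (the module and function names here are illustrative, not from this repo):

```python
# Optional-dependency probe: import once, remember the result in a flag.
try:
    import librosa  # heavy optional audio dependency
    HAS_AUDIO = True
except ImportError as err:
    print(f"Audio support disabled: {err}")
    HAS_AUDIO = False

def load_audio(path: str):
    """Fail fast with a clear message when the optional feature is used anyway."""
    if not HAS_AUDIO:
        raise RuntimeError("audio support not installed; pip install librosa")
    return librosa.load(path, sr=16000)
```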
```diff
@@ -51,15 +38,6 @@ try:
 except ImportError:
     pass
 
-# Warm up voice model if available
-if VOICE_DEPENDENCIES_AVAILABLE and voice_model:
-    try:
-        print("Warming up voice model...")
-        voice_model.warm_up()
-        print("✅ Voice model warmed up successfully")
-    except Exception as e:
-        print(f"⚠️ Voice model warm-up failed: {e}")
-
 
 class DOLPHIN:
     def __init__(self, model_id_or_path):
```
```diff
@@ -509,158 +487,6 @@ show_results_tab = False
 document_chunks = []
 document_embeddings = None
 
-# Voice chat parameters and state
-IN_CHANNELS = 1
-IN_RATE = 24000
-IN_CHUNK = 1024
-IN_SAMPLE_WIDTH = 2
-VAD_STRIDE = 0.5
-OUT_CHANNELS = 1
-OUT_RATE = 24000
-OUT_SAMPLE_WIDTH = 2
-OUT_CHUNK = 20 * 4096
-
-# Initialize voice inference model if available
-voice_model = None
-if VOICE_DEPENDENCIES_AVAILABLE:
-    try:
-        print("Loading voice model for Talk with Gemma...")
-        voice_model = Gemma3nInference(device='cuda' if torch.cuda.is_available() else 'cpu')
-        print("✅ Voice model loaded successfully")
-    except Exception as e:
-        print(f"❌ Error loading voice model: {e}")
-        VOICE_DEPENDENCIES_AVAILABLE = False
-
-@dataclass
-class VoiceAppState:
-    stream: np.ndarray | None = None
-    sampling_rate: int = 0
-    pause_detected: bool = False
-    started_talking: bool = False
-    stopped: bool = False
-    conversation: list = field(default_factory=list)
-
-
-# Voice functionality
-def run_vad(ori_audio, sr):
-    """Voice Activity Detection"""
-    _st = time.time()
-    try:
-        audio = ori_audio
-        if isinstance(audio, bytes):
-            audio = np.frombuffer(audio, dtype=np.int16)
-        audio = audio.astype(np.float32) / 32768.0
-        sampling_rate = 16000
-        if sr != sampling_rate:
-            audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
-
-        vad_parameters = {}
-        vad_parameters = VadOptions(**vad_parameters)
-        speech_chunks = get_speech_timestamps(audio, vad_parameters)
-        audio = collect_chunks(audio, speech_chunks)
-        duration_after_vad = audio.shape[0] / sampling_rate
-
-        if sr != sampling_rate:
-            vad_audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)
-        else:
-            vad_audio = audio
-        vad_audio = np.round(vad_audio * 32768.0).astype(np.int16)
-        vad_audio_bytes = vad_audio.tobytes()
-
-        return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
-    except Exception as e:
-        msg = f"[asr vad error] audio_len: {len(ori_audio)/(sr*2):.3f} s, trace: {e}"
-        print(msg)
-        return -1, ori_audio, round(time.time() - _st, 4)
-
-def determine_pause(audio: np.ndarray, sampling_rate: int, state: VoiceAppState) -> bool:
-    """Determine if a pause happened in the audio stream"""
-    temp_audio = audio
-    dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
-    duration = len(audio) / sampling_rate
-
-    if dur_vad > 0.5 and not state.started_talking:
-        print("started talking")
-        state.started_talking = True
-        return False
-
-    print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
-    return (duration - dur_vad) > 1
-
-def process_voice_audio(audio: tuple, state: VoiceAppState):
-    """Process streaming audio input"""
-    if not VOICE_DEPENDENCIES_AVAILABLE or voice_model is None:
-        return None, state
-
-    if state.stream is None:
-        state.stream = audio[1]
-        state.sampling_rate = audio[0]
-    else:
-        state.stream = np.concatenate((state.stream, audio[1]))
-
-    pause_detected = determine_pause(state.stream, state.sampling_rate, state)
-    state.pause_detected = pause_detected
-
-    if state.pause_detected and state.started_talking:
-        return gr.Audio(recording=False), state
-    return None, state
-
-def generate_voice_response(state: VoiceAppState):
-    """Generate voice response from audio input"""
-    if not VOICE_DEPENDENCIES_AVAILABLE or voice_model is None:
-        return None, VoiceAppState()
-
-    if not state.pause_detected and not state.started_talking:
-        return None, VoiceAppState()
-
-    try:
-        audio_buffer = io.BytesIO()
-        segment = AudioSegment(
-            state.stream.tobytes(),
-            frame_rate=state.sampling_rate,
-            sample_width=state.stream.dtype.itemsize,
-            channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
-        )
-        segment.export(audio_buffer, format="wav")
-
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            f.write(audio_buffer.getvalue())
-            temp_audio_path = f.name
-
-        try:
-            # Generate text response from audio
-            text_response = voice_model.generate_response(temp_audio_path)
-            print(f"Generated voice response: {text_response}")
-
-            # Convert text to speech
-            audio_response = voice_model.text_to_speech_simple(text_response)
-
-            # Convert to format expected by Gradio
-            audio_segment = AudioSegment.from_file(io.BytesIO(audio_response), format="wav")
-            audio_array = np.array(audio_segment.get_array_of_samples())
-
-            if audio_segment.channels == 2:
-                audio_array = audio_array.reshape((-1, 2))
-
-            # Update conversation history
-            state.conversation.append({"role": "user", "content": {"path": temp_audio_path, "mime_type": "audio/wav"}})
-            state.conversation.append({"role": "assistant", "content": {"text": text_response}})
-
-            return (audio_segment.frame_rate, audio_array), VoiceAppState(conversation=state.conversation)
-
-        finally:
-            if os.path.exists(temp_audio_path):
-                os.unlink(temp_audio_path)
-
-    except Exception as e:
-        print(f"Error generating voice response: {e}")
-        return None, VoiceAppState()
-
-def start_voice_recording(state: VoiceAppState):
-    """Start recording user voice input"""
-    if not state.stopped:
-        return gr.Audio(recording=True)
-    return gr.Audio(recording=False)
 
 def chunk_document(text, chunk_size=1024, overlap=100):
     """Split document into overlapping chunks for RAG"""
```
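The heart of the removed turn-taking logic is `determine_pause`: measure how much of the accumulated buffer the VAD classifies as speech, and declare end-of-turn once the unvoiced remainder exceeds one second. A standalone sketch of that heuristic, with a crude RMS-energy threshold standing in for the silero-style VAD used above (`voiced_duration` and its parameters are illustrative assumptions):

```python
import numpy as np

def voiced_duration(audio: np.ndarray, sr: int, frame_ms: int = 30,
                    thresh: float = 0.01) -> float:
    """Seconds of frames whose RMS energy exceeds a threshold (crude VAD stand-in)."""
    frame = int(sr * frame_ms / 1000)
    n = len(audio) // frame
    frames = audio[: n * frame].reshape(n, frame)
    rms = np.sqrt((frames ** 2).mean(axis=1))
    return float((rms > thresh).sum()) * frame_ms / 1000.0

def pause_detected(audio: np.ndarray, sr: int, started_talking: bool,
                   min_pause_s: float = 1.0) -> bool:
    """End-of-turn once the unvoiced tail of the buffer exceeds min_pause_s."""
    dur_vad = voiced_duration(audio, sr)
    duration = len(audio) / sr
    if dur_vad > 0.5 and not started_talking:
        return False  # user just started talking; keep listening
    return (duration - dur_vad) > min_pause_s
```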
```diff
@@ -819,22 +645,15 @@ with gr.Blocks(
         # Home Tab
         with gr.TabItem("🏠 Home", id="home"):
             embedding_status = "✅ RAG ready" if embedding_model else "❌ RAG not loaded"
-            voice_status = "✅ Voice chat ready" if VOICE_DEPENDENCIES_AVAILABLE and voice_model else "❌ Voice chat not available"
             gr.Markdown(
-                "# Scholar Express - Local Gemma 3n Version
-                "### Upload a research paper to get a web-friendly version with AI-generated alt text for accessibility. Includes an AI chatbot
+                "# Scholar Express - Local Gemma 3n Version\n"
+                "### Upload a research paper to get a web-friendly version with AI-generated alt text for accessibility. Includes an AI chatbot powered by local Gemma 3n.\n"
                 f"**System:** {model_status}\n"
                 f"**RAG System:** {embedding_status}\n"
-                f"**Voice Chat:** {voice_status}\n"
                 f"**DOLPHIN:** Local model for PDF processing\n"
-                f"**Gemma 3n:** Local model for alt text generation
+                f"**Gemma 3n:** Local model for alt text generation and chat\n"
                 f"**Alt Text:** Gemma 3n generates descriptive alt text for images\n"
-                f"**GPU:** {'CUDA available' if torch.cuda.is_available() else 'CPU only'}
-                "**Features:**\n"
-                "- 📄 PDF processing with OCR and layout analysis\n"
-                "- 💬 Text-based chat about your documents\n"
-                "- 🎙️ Voice chat with Gemma 3n (new!)\n"
-                "- ♿ AI-generated alt text for accessibility"
+                f"**GPU:** {'CUDA available' if torch.cuda.is_available() else 'CPU only'}"
             )
 
             with gr.Column(elem_classes="upload-container"):
```
```diff
@@ -908,51 +727,6 @@ with gr.Blocks(
                 "*Ask questions about your processed document. The AI uses RAG (Retrieval-Augmented Generation) with local Gemma 3n to find relevant sections and provide accurate answers.*",
                 elem_id="chat-notice"
             )
-
-        # Voice Chat Tab
-        with gr.TabItem("🎙️ Talk with Gemma", id="voice") as voice_tab:
-            voice_status = "✅ Voice chat ready" if VOICE_DEPENDENCIES_AVAILABLE and voice_model else "❌ Voice chat not available"
-            gr.Markdown(f"## Voice Chat with Gemma 3n\n{voice_status}")
-
-            if VOICE_DEPENDENCIES_AVAILABLE and voice_model:
-                with gr.Row():
-                    with gr.Column():
-                        voice_input_audio = gr.Audio(
-                            label="Speak to Gemma",
-                            sources=["microphone"],
-                            type="numpy",
-                            streaming=True
-                        )
-                    with gr.Column():
-                        voice_output_audio = gr.Audio(
-                            label="Gemma's Response",
-                            streaming=True,
-                            autoplay=True
-                        )
-                voice_chatbot = gr.Chatbot(
-                    label="Voice Conversation",
-                    type="messages",
-                    height=300
-                )
-
-                with gr.Row():
-                    voice_stop_btn = gr.Button("⏹️ Stop Conversation", variant="stop")
-
-                gr.Markdown(
-                    "*Speak naturally to Gemma 3n. The AI will listen to your voice, process your speech, and respond with both text and voice. You can have conversations before or after processing PDFs.*"
-                )
-
-                # Voice state
-                voice_state = gr.State(value=VoiceAppState())
-            else:
-                gr.Markdown(
-                    "### Voice chat is not available\n"
-                    "To enable voice chat, please install the required dependencies:\n"
-                    "```bash\n"
-                    "pip install librosa pydub onnxruntime\n"
-                    "```\n"
-                    "And ensure the voice_chat directory is properly set up."
-                )
 
     # Event handlers
     process_btn.click(
```
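The removed tab also illustrates graceful degradation in a Gradio UI: components are only constructed when the optional backend actually loaded, and the tab otherwise renders install instructions. A minimal sketch of the same layout decision (assuming a recent Gradio; the flag would be set by an import probe like the one at the top of the file):

```python
import gradio as gr

VOICE_OK = False  # set by an optional-import probe at startup

with gr.Blocks() as demo:
    with gr.TabItem("🎙️ Voice"):
        if VOICE_OK:
            # Build the real components only when the backend is usable.
            mic = gr.Audio(label="Speak", sources=["microphone"],
                           type="numpy", streaming=True)
        else:
            gr.Markdown(
                "### Voice chat is not available\n"
                "Install the optional dependencies to enable it:\n"
                "```bash\npip install librosa pydub onnxruntime\n```"
            )

if __name__ == "__main__":
    demo.launch()
```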
```diff
@@ -1040,46 +814,6 @@ Please provide a clear and helpful answer based on the context provided."""
         lambda: "",
         outputs=[msg_input]
     )
-
-    # Voice chat event handlers
-    if VOICE_DEPENDENCIES_AVAILABLE and voice_model:
-        # Stream processing
-        voice_stream = voice_input_audio.stream(
-            process_voice_audio,
-            [voice_input_audio, voice_state],
-            [voice_input_audio, voice_state],
-            stream_every=0.50,
-            time_limit=30,
-        )
-
-        # Response generation
-        voice_respond = voice_input_audio.stop_recording(
-            generate_voice_response,
-            [voice_state],
-            [voice_output_audio, voice_state]
-        )
-
-        # Update chatbot display
-        voice_respond.then(
-            lambda s: s.conversation,
-            [voice_state],
-            [voice_chatbot]
-        )
-
-        # Restart recording
-        voice_restart = voice_output_audio.stop(
-            start_voice_recording,
-            [voice_state],
-            [voice_input_audio]
-        )
-
-        # Stop conversation
-        voice_stop_btn.click(
-            lambda: (VoiceAppState(stopped=True), gr.Audio(recording=False)),
-            None,
-            [voice_state, voice_input_audio],
-            cancels=[voice_respond, voice_restart]
-        )
 
 
 if __name__ == "__main__":
```