Update app.py

app.py CHANGED
@@ -30,6 +30,19 @@ import io
 from utils.utils import *
 from utils.markdown_utils import MarkdownConverter
 
+# Voice functionality imports
+import time
+import librosa
+from dataclasses import dataclass, field
+from pydub import AudioSegment
+try:
+    from voice_chat.utils.vad import get_speech_timestamps, collect_chunks, VadOptions
+    from voice_chat.gemma3n_inference import Gemma3nInference
+    VOICE_DEPENDENCIES_AVAILABLE = True
+except ImportError as e:
+    print(f"Voice dependencies not available: {e}")
+    VOICE_DEPENDENCIES_AVAILABLE = False
+
 # Math extension is optional for enhanced math rendering
 MATH_EXTENSION_AVAILABLE = False
 try:
@@ -38,6 +51,19 @@ try:
 except ImportError:
     pass
 
+# Initialize voice model early to avoid NameError
+voice_model = None
+if VOICE_DEPENDENCIES_AVAILABLE:
+    try:
+        print("Loading voice model...")
+        voice_model = Gemma3nInference(device='cuda' if torch.cuda.is_available() else 'cpu')
+        print("Warming up voice model...")
+        voice_model.warm_up()
+        print("✅ Voice model loaded and warmed up successfully")
+    except Exception as e:
+        print(f"⚠️ Voice model initialization failed: {e}")
+        voice_model = None
+
 
 class DOLPHIN:
     def __init__(self, model_id_or_path):
@@ -487,6 +513,149 @@ show_results_tab = False
 document_chunks = []
 document_embeddings = None
 
+# Voice chat parameters and state
+IN_CHANNELS = 1
+IN_RATE = 24000
+IN_CHUNK = 1024
+IN_SAMPLE_WIDTH = 2
+VAD_STRIDE = 0.5
+OUT_CHANNELS = 1
+OUT_RATE = 24000
+OUT_SAMPLE_WIDTH = 2
+OUT_CHUNK = 20 * 4096
+
+# Voice model already initialized earlier in the file
+
+@dataclass
+class VoiceAppState:
+    stream: np.ndarray | None = None
+    sampling_rate: int = 0
+    pause_detected: bool = False
+    started_talking: bool = False
+    stopped: bool = False
+    conversation: list = field(default_factory=list)
+
+
+# Voice functionality
+def run_vad(ori_audio, sr):
+    """Voice Activity Detection"""
+    _st = time.time()
+    try:
+        audio = ori_audio
+        if isinstance(audio, bytes):
+            audio = np.frombuffer(audio, dtype=np.int16)
+        audio = audio.astype(np.float32) / 32768.0
+        sampling_rate = 16000
+        if sr != sampling_rate:
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
+
+        vad_parameters = {}
+        vad_parameters = VadOptions(**vad_parameters)
+        speech_chunks = get_speech_timestamps(audio, vad_parameters)
+        audio = collect_chunks(audio, speech_chunks)
+        duration_after_vad = audio.shape[0] / sampling_rate
+
+        if sr != sampling_rate:
+            vad_audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)
+        else:
+            vad_audio = audio
+        vad_audio = np.round(vad_audio * 32768.0).astype(np.int16)
+        vad_audio_bytes = vad_audio.tobytes()
+
+        return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
+    except Exception as e:
+        msg = f"[asr vad error] audio_len: {len(ori_audio)/(sr*2):.3f} s, trace: {e}"
+        print(msg)
+        return -1, ori_audio, round(time.time() - _st, 4)
+
+def determine_pause(audio: np.ndarray, sampling_rate: int, state: VoiceAppState) -> bool:
+    """Determine if a pause happened in the audio stream"""
+    temp_audio = audio
+    dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
+    duration = len(audio) / sampling_rate
+
+    if dur_vad > 0.5 and not state.started_talking:
+        print("started talking")
+        state.started_talking = True
+        return False
+
+    print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
+    return (duration - dur_vad) > 1
+
+def process_voice_audio(audio: tuple, state: VoiceAppState):
+    """Process streaming audio input"""
+    if not VOICE_DEPENDENCIES_AVAILABLE or voice_model is None:
+        return None, state
+
+    if state.stream is None:
+        state.stream = audio[1]
+        state.sampling_rate = audio[0]
+    else:
+        state.stream = np.concatenate((state.stream, audio[1]))
+
+    pause_detected = determine_pause(state.stream, state.sampling_rate, state)
+    state.pause_detected = pause_detected
+
+    if state.pause_detected and state.started_talking:
+        return gr.Audio(recording=False), state
+    return None, state
+
+def generate_voice_response(state: VoiceAppState):
+    """Generate voice response from audio input"""
+    if not VOICE_DEPENDENCIES_AVAILABLE or voice_model is None:
+        return None, VoiceAppState()
+
+    if not state.pause_detected and not state.started_talking:
+        return None, VoiceAppState()
+
+    try:
+        audio_buffer = io.BytesIO()
+        segment = AudioSegment(
+            state.stream.tobytes(),
+            frame_rate=state.sampling_rate,
+            sample_width=state.stream.dtype.itemsize,
+            channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
+        )
+        segment.export(audio_buffer, format="wav")
+
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+            f.write(audio_buffer.getvalue())
+            temp_audio_path = f.name
+
+        try:
+            # Generate text response from audio
+            text_response = voice_model.generate_response(temp_audio_path)
+            print(f"Generated voice response: {text_response}")
+
+            # Convert text to speech
+            audio_response = voice_model.text_to_speech_simple(text_response)
+
+            # Convert to format expected by Gradio
+            audio_segment = AudioSegment.from_file(io.BytesIO(audio_response), format="wav")
+            audio_array = np.array(audio_segment.get_array_of_samples())
+
+            if audio_segment.channels == 2:
+                audio_array = audio_array.reshape((-1, 2))
+
+            # Update conversation history
+            state.conversation.append({"role": "user", "content": {"path": temp_audio_path, "mime_type": "audio/wav"}})
+            state.conversation.append({"role": "assistant", "content": {"text": text_response}})
+
+            return (audio_segment.frame_rate, audio_array), VoiceAppState(conversation=state.conversation)
+
+        finally:
+            if os.path.exists(temp_audio_path):
+                os.unlink(temp_audio_path)
+
+    except Exception as e:
+        print(f"Error generating voice response: {e}")
+        return None, VoiceAppState()
+
+def start_voice_recording(state: VoiceAppState):
+    """Start recording user voice input"""
+    if not state.stopped:
+        return gr.Audio(recording=True)
+    return gr.Audio(recording=False)
 
 def chunk_document(text, chunk_size=1024, overlap=100):
     """Split document into overlapping chunks for RAG"""
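The turn-taking rule added above is easy to miss: `determine_pause` arms itself once more than half a second of speech has been detected, and from then on a turn ends only when the buffered audio contains more than one second of non-speech. Below is a minimal sketch of that rule in isolation, using a stand-in `fake_speech_duration` instead of the VAD helpers imported from `voice_chat.utils.vad`; the stub and the example durations are illustrative assumptions, not part of the commit.

```python
# Illustrative sketch of the pause rule in determine_pause().
# fake_speech_duration stands in for run_vad(); the real app measures speech
# with the VAD helpers imported from voice_chat.utils.vad.
from dataclasses import dataclass

@dataclass
class TurnState:
    started_talking: bool = False

def fake_speech_duration(buffered_s: float, trailing_silence_s: float) -> float:
    """Pretend VAD: everything except the trailing silence counts as speech."""
    return max(buffered_s - trailing_silence_s, 0.0)

def pause_detected(buffered_s: float, trailing_silence_s: float, state: TurnState) -> bool:
    speech_s = fake_speech_duration(buffered_s, trailing_silence_s)
    if speech_s > 0.5 and not state.started_talking:
        state.started_talking = True       # first half-second of speech arms the detector
        return False
    return (buffered_s - speech_s) > 1     # more than 1 s of non-speech ends the turn

state = TurnState()
print(pause_detected(1.0, 0.2, state))    # False: detector just armed
print(pause_detected(2.5, 0.6, state))    # False: only 0.6 s of silence so far
print(pause_detected(4.0, 1.5, state))    # True: 1.5 s of silence -> stop recording
```

Because `process_voice_audio` re-runs this check over the whole buffer on every stream tick (every 0.5 s, per the event wiring later in the diff), the one-second silence threshold is what actually ends a recording turn.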
@@ -645,15 +814,22 @@ with gr.Blocks(
         # Home Tab
         with gr.TabItem("🏠 Home", id="home"):
             embedding_status = "✅ RAG ready" if embedding_model else "❌ RAG not loaded"
+            voice_status = "✅ Voice chat ready" if VOICE_DEPENDENCIES_AVAILABLE and voice_model else "❌ Voice chat not available"
             gr.Markdown(
-                "# Scholar Express - Local Gemma 3n Version\n"
-                "### Upload a research paper to get a web-friendly version with AI-generated alt text for accessibility. Includes an AI chatbot powered by local Gemma 3n.\n"
+                "# Scholar Express - Local Gemma 3n Version with Voice\n"
+                "### Upload a research paper to get a web-friendly version with AI-generated alt text for accessibility. Includes an AI chatbot and voice chat powered by local Gemma 3n.\n"
                 f"**System:** {model_status}\n"
                 f"**RAG System:** {embedding_status}\n"
+                f"**Voice Chat:** {voice_status}\n"
                 f"**DOLPHIN:** Local model for PDF processing\n"
-                f"**Gemma 3n:** Local model for alt text generation and chat\n"
+                f"**Gemma 3n:** Local model for alt text generation, chat, and voice\n"
                 f"**Alt Text:** Gemma 3n generates descriptive alt text for images\n"
-                f"**GPU:** {'CUDA available' if torch.cuda.is_available() else 'CPU only'}"
+                f"**GPU:** {'CUDA available' if torch.cuda.is_available() else 'CPU only'}\n\n"
+                "**Features:**\n"
+                "- 📄 PDF processing with OCR and layout analysis\n"
+                "- 💬 Text-based chat about your documents\n"
+                "- 🎙️ Voice chat with Gemma 3n (new!)\n"
+                "- ♿ AI-generated alt text for accessibility"
             )
 
             with gr.Column(elem_classes="upload-container"):
@@ -727,6 +903,51 @@ with gr.Blocks(
                 "*Ask questions about your processed document. The AI uses RAG (Retrieval-Augmented Generation) with local Gemma 3n to find relevant sections and provide accurate answers.*",
                 elem_id="chat-notice"
             )
+
+        # Voice Chat Tab
+        with gr.TabItem("🎙️ Talk with Gemma", id="voice") as voice_tab:
+            voice_status = "✅ Voice chat ready" if VOICE_DEPENDENCIES_AVAILABLE and voice_model else "❌ Voice chat not available"
+            gr.Markdown(f"## Voice Chat with Gemma 3n\n{voice_status}")
+
+            if VOICE_DEPENDENCIES_AVAILABLE and voice_model:
+                with gr.Row():
+                    with gr.Column():
+                        voice_input_audio = gr.Audio(
+                            label="Speak to Gemma",
+                            sources=["microphone"],
+                            type="numpy",
+                            streaming=True
+                        )
+                    with gr.Column():
+                        voice_output_audio = gr.Audio(
+                            label="Gemma's Response",
+                            streaming=True,
+                            autoplay=True
+                        )
+                voice_chatbot = gr.Chatbot(
+                    label="Voice Conversation",
+                    type="messages",
+                    height=300
+                )
+
+                with gr.Row():
+                    voice_stop_btn = gr.Button("⏹️ Stop Conversation", variant="stop")
+
+                gr.Markdown(
+                    "*Speak naturally to Gemma 3n. The AI will listen to your voice, process your speech, and respond with both text and voice. You can have conversations before or after processing PDFs.*"
+                )
+
+                # Voice state
+                voice_state = gr.State(value=VoiceAppState())
+            else:
+                gr.Markdown(
+                    "### Voice chat is not available\n"
+                    "To enable voice chat, please install the required dependencies:\n"
+                    "```bash\n"
+                    "pip install librosa pydub onnxruntime\n"
+                    "```\n"
+                    "And ensure the voice_chat directory is properly set up."
+                )
 
     # Event handlers
     process_btn.click(
@@ -814,6 +1035,46 @@ Please provide a clear and helpful answer based on the context provided."""
         lambda: "",
         outputs=[msg_input]
     )
+
+    # Voice chat event handlers
+    if VOICE_DEPENDENCIES_AVAILABLE and voice_model:
+        # Stream processing
+        voice_stream = voice_input_audio.stream(
+            process_voice_audio,
+            [voice_input_audio, voice_state],
+            [voice_input_audio, voice_state],
+            stream_every=0.50,
+            time_limit=30,
+        )
+
+        # Response generation
+        voice_respond = voice_input_audio.stop_recording(
+            generate_voice_response,
+            [voice_state],
+            [voice_output_audio, voice_state]
+        )
+
+        # Update chatbot display
+        voice_respond.then(
+            lambda s: s.conversation,
+            [voice_state],
+            [voice_chatbot]
+        )
+
+        # Restart recording
+        voice_restart = voice_output_audio.stop(
+            start_voice_recording,
+            [voice_state],
+            [voice_input_audio]
+        )
+
+        # Stop conversation
+        voice_stop_btn.click(
+            lambda: (VoiceAppState(stopped=True), gr.Audio(recording=False)),
+            None,
+            [voice_state, voice_input_audio],
+            cancels=[voice_respond, voice_restart]
+        )
 
 
 if __name__ == "__main__":
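Taken together, the handlers in this last hunk form a loop: `stream` keeps appending microphone chunks and turns recording off once `process_voice_audio` reports a pause, `stop_recording` runs `generate_voice_response`, and the output player's `stop` event re-arms the microphone via `start_voice_recording`. The sketch below shows the same wiring in isolation with echo stand-ins in place of the Gemma 3n calls; the `echo_*` helpers and the three-second cutoff are illustrative assumptions, while the components and events mirror the commit.

```python
# Minimal turn-taking loop with the same Gradio wiring as the commit,
# using echo stand-ins (echo_process / echo_reply) instead of Gemma 3n.
import gradio as gr
import numpy as np

def echo_process(audio, state):
    """Accumulate streamed chunks; stop recording once ~3 s are buffered."""
    sr, chunk = audio
    if state is None:
        state = (sr, chunk)
    else:
        state = (sr, np.concatenate((state[1], chunk)))
    if len(state[1]) / sr > 3:                      # stand-in for pause detection
        return gr.Audio(recording=False), state
    return None, state

def echo_reply(state):
    """Stand-in for generate_voice_response: play the buffered turn back."""
    if state is None:
        return None, None
    sr, buffered = state
    return (sr, buffered), None                     # clear the buffer for the next turn

with gr.Blocks() as demo:
    mic = gr.Audio(sources=["microphone"], type="numpy", streaming=True, label="Speak")
    out = gr.Audio(autoplay=True, label="Reply")
    buf = gr.State(value=None)

    mic.stream(echo_process, [mic, buf], [mic, buf], stream_every=0.5)   # gather chunks
    mic.stop_recording(echo_reply, [buf], [out, buf])                    # produce the reply
    out.stop(lambda: gr.Audio(recording=True), None, [mic])              # re-arm the mic

demo.launch()
```

In the commit, the stop button is what breaks this loop: `cancels=[voice_respond, voice_restart]` aborts any in-flight response and prevents the automatic restart from firing.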