Spaces:

Testys
/

drive-paddy

Sleeping

App Files Community

Testys committed 26 days ago

Commit

95b307f

verified ·

1 Parent(s): fb982e6

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -48

app.py CHANGED Viewed

@@ -1,14 +1,18 @@
 # app_gradio.py
 import gradio as gr
 import numpy as np
 import os
 import yaml
 from dotenv import load_dotenv
-import io
-from scipy.io.wavfile import read as read_wav
-import time
-# Correctly import from the drive_paddy package structure
 from src.detection.factory import get_detector
 from src.alerting.alert_system import get_alerter
@@ -28,88 +32,124 @@ detector = get_detector(config)
 alerter = get_alerter(config, secrets["gemini_api_key"])
 print("Initialization complete. Launching UI...")
-STREAM_START_TIME = None
-# --- Audio Processing for Gradio ---
-# Gradio's gr.Audio component needs a specific format: (sample_rate, numpy_array)
-def process_audio_for_gradio(audio_bytes):
-    """Converts in-memory audio bytes to a format Gradio can play."""
-    # gTTS creates MP3, so we read it as such
-    byte_io = io.BytesIO(audio_bytes)
     try:
-        from pydub import AudioSegment
-        audio = AudioSegment.from_mp3(byte_io)
-        wav_byte_io = io.BytesIO()
-        audio.export(wav_byte_io, format="wav")
-        wav_byte_io.seek(0)
-        sample_rate, data = read_wav(wav_byte_io)
-        return (sample_rate, data)
-    except Exception as e:
-        print(f"Could not process audio for Gradio: {e}")
-        return None
 def process_live_frame(frame):
     """
-    Takes a single frame from the Gradio webcam input, processes it,
-    and returns the processed frame, status text, and any audio alerts.
     """
     if frame is None:
-        # Return default values if frame is None
-        blank_image = np.zeros((480, 640, 3), dtype=np.uint8)
-        return blank_image, "Status: Inactive", None
-    # Process the frame using our existing detector
     processed_frame, indicators, _ = detector.process_frame(frame)
     drowsiness_level = indicators.get("drowsiness_level", "Awake")
     lighting = indicators.get("lighting", "Good")
     score = indicators.get("details", {}).get("Score", 0)
-    # Build the status text
     status_text = f"Lighting: {lighting}\n"
     if lighting == "Low":
         status_text += "Detection paused due to low light."
     else:
         status_text += f"Status: {drowsiness_level}\nScore: {score:.2f}"
-    # Handle alerts
     audio_output = None
     if drowsiness_level != "Awake":
-        audio_data = alerter.trigger_alert(level=drowsiness_level)
-        if audio_data:
-            audio_output = process_audio_for_gradio(audio_data)
-    else:
-        alerter.reset_alert()
-    # Return all the values needed to update the UI
     return processed_frame, status_text, audio_output
 # --- Gradio UI Definition ---
-with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="blue")) as app:
-    gr.Markdown("# 🚗 Drive Paddy - Drowsiness Detection (Gradio)")
-    gr.Markdown("A live test using Gradio's webcam component. This can be more stable than WebRTC in some environments.")
     with gr.Row():
-        with gr.Column():
-            # Input: Live webcam feed
             webcam_input = gr.Image(sources=["webcam"], streaming=True, label="Live Camera Feed")
-        with gr.Column():
-            # Output 1: Processed video feed
             processed_output = gr.Image(label="Processed Feed")
-            # Output 2: Live status text
             status_output = gr.Textbox(label="Live Status", lines=3, interactive=False)
-            # Output 3: Hidden audio player for alerts
-            audio_alert_output = gr.Audio(autoplay=True, visible=False)
-    # Link the input to the processing function and the function to the outputs
     webcam_input.stream(
         fn=process_live_frame,
         inputs=[webcam_input],
         outputs=[processed_output, status_output, audio_alert_output]
     )
 # --- Launch the App ---
 if __name__ == "__main__":
     app.launch(debug=True)

 # app_gradio.py
 import gradio as gr
 import numpy as np
+import torch
+import soundfile as sf
 import os
 import yaml
 from dotenv import load_dotenv
+from threading import Thread
+# --- TTS & AI Imports ---
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
+from streamer import ParlerTTSStreamer # Make sure streamer.py is available
 from src.detection.factory import get_detector
 from src.alerting.alert_system import get_alerter
 alerter = get_alerter(config, secrets["gemini_api_key"])
 print("Initialization complete. Launching UI...")
+# --- Parler-TTS Model Setup (Requires GPU) ---
+print("Loading Parler-TTS model. This may take a moment...")
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+if device == "cpu":
+    print("\nWARNING: Running Parler-TTS on a CPU will be extremely slow. A GPU is highly recommended.\n")
+torch_dtype = torch.float16 if device != "cpu" else torch.float32
+# Using a smaller, faster model suitable for real-time alerts
+repo_id = "parler-tts/parler_tts_mini_v0.1"
+model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
+tokenizer = AutoTokenizer.from_pretrained(repo_id)
+feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
+print("Parler-TTS model loaded.")
+# --- Audio Streaming Generator Function ---
+def stream_alert_audio(text_prompt):
+    """
+    A generator function that yields audio chunks for a given text prompt.
+    This is the core of the streaming implementation.
+    """
+    sampling_rate = model.config.sampling_rate
+    description = "Jenny is A female speaker with a clear and urgent voice." # Voice prompt for TTS
+    prompt_ids = tokenizer(text_prompt, return_tensors="pt").input_ids.to(device)
+    description_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
+    # Setup the streamer
+    streamer = ParlerTTSStreamer(model, device, play_steps=int(sampling_rate * 2.0))
+    generation_kwargs = dict(
+        input_ids=description_ids,
+        prompt_input_ids=prompt_ids,
+        streamer=streamer,
+        do_sample=True,
+        temperature=1.0, # Increase for more vocal variety
+        repetition_penalty=1.2,
+    )
+    # Run generation in a separate thread to not block the UI
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
     try:
+        thread.start()
+        print(f"Audio stream started for: '{text_prompt}'")
+        # Yield audio chunks as they become available
+        for new_audio_chunk in streamer:
+            yield (sampling_rate, new_audio_chunk)
+    finally:
+        # CRITICAL: This block runs when the generator is exhausted (audio finishes),
+        # and also if the generator is closed early or an exception is raised while streaming.
+        # We reset the alerter state so that a new alert can be triggered later.
+        print("Audio stream finished. Resetting alerter state.")
+        alerter.reset_alert()
+# --- Main Webcam Processing Function ---
 def process_live_frame(frame):
     """
+    Processes each webcam frame, performs drowsiness detection, and
+    returns a generator for audio streaming when an alert is triggered.
     """
     if frame is None:
+        return np.zeros((480, 640, 3), dtype=np.uint8), "Status: Inactive", None
     processed_frame, indicators, _ = detector.process_frame(frame)
     drowsiness_level = indicators.get("drowsiness_level", "Awake")
     lighting = indicators.get("lighting", "Good")
     score = indicators.get("details", {}).get("Score", 0)
+    # Build status text
     status_text = f"Lighting: {lighting}\n"
     if lighting == "Low":
         status_text += "Detection paused due to low light."
     else:
         status_text += f"Status: {drowsiness_level}\nScore: {score:.2f}"
+    # --- Alert Trigger Logic ---
     audio_output = None
     if drowsiness_level != "Awake":
+        # alerter.trigger_alert() returns the alert TEXT if not on cooldown, otherwise None.
+        alert_text = alerter.trigger_alert(level=drowsiness_level)
+        if alert_text:
+            # If we got text, it means we can start an alert.
+            # We return the generator object created by calling stream_alert_audio();
+            # the streaming Audio output is expected to consume its chunks.
+            audio_output = stream_alert_audio(alert_text)
+    # On subsequent frames where the user is drowsy, trigger_alert() will return None
+    # due to the cooldown, preventing a new stream from starting, which is what we want.
     return processed_frame, status_text, audio_output
 # --- Gradio UI Definition ---
+with gr.Blocks(theme=gr.themes.Default(primary_hue="blue")) as app:
+    gr.Markdown("# 🚗 Drive Paddy - Drowsiness Detection (Streaming)")
+    gr.Markdown("Live drowsiness detection with real-time, streaming voice alerts.")
     with gr.Row():
+        with gr.Column(scale=2):
             webcam_input = gr.Image(sources=["webcam"], streaming=True, label="Live Camera Feed")
+        with gr.Column(scale=1):
             processed_output = gr.Image(label="Processed Feed")
             status_output = gr.Textbox(label="Live Status", lines=3, interactive=False)
+            # --- KEY CHANGE: The Audio component now uses streaming=True ---
+            audio_alert_output = gr.Audio(
+                label="Alert System",
+                autoplay=True,
+                visible=False, # Hide the player controls
+                streaming=True
+            )
     webcam_input.stream(
         fn=process_live_frame,
         inputs=[webcam_input],
         outputs=[processed_output, status_output, audio_alert_output]
     )
 # --- Launch the App ---
 if __name__ == "__main__":
     app.launch(debug=True)