Real_Time_diarization

Sleeping

App Files Files Community

Saiyaswanth007 committed on May 25

Commit

534a53d

1 Parent(s): 2b9c901

Check point 4

Browse files

Files changed (1) hide show

app.py +325 -110

app.py CHANGED Viewed

@@ -10,7 +10,7 @@ import torchaudio
 from scipy.spatial.distance import cosine
 from RealtimeSTT import AudioToTextRecorder
 from fastapi import FastAPI, APIRouter
-from fastrtc import Stream, ReplyOnPause, StreamHandler, get_cloudflare_turn_credentials_async, get_cloudflare_turn_credentials
 import json
 import asyncio
 import uvicorn
@@ -329,48 +329,11 @@ class RealtimeSpeakerDiarization:
         except Exception as e:
             logger.error(f"Model initialization error: {e}")
             return False
-    def feed_audio(self, audio_data):
-        """Feed audio data directly to the recorder for live transcription"""
-        if not self.is_running or not self.recorder:
-            return
-        try:
-            # Normalize if needed
-            if isinstance(audio_data, np.ndarray):
-                if audio_data.dtype != np.float32:
-                    audio_data = audio_data.astype(np.float32)
-                # Convert to int16 for the recorder
-                audio_int16 = (audio_data * 32767).astype(np.int16)
-                audio_bytes = audio_int16.tobytes()
-                # Feed to recorder
-                self.recorder.feed_audio(audio_bytes)
-                # Also process for speaker detection
-                self.process_audio_chunk(audio_data)
-            elif isinstance(audio_data, bytes):
-                # Feed raw bytes directly
-                self.recorder.feed_audio(audio_data)
-                # Convert to float for speaker detection
-                audio_int16 = np.frombuffer(audio_data, dtype=np.int16)
-                audio_float = audio_int16.astype(np.float32) / 32768.0
-                self.process_audio_chunk(audio_float)
-            logger.debug("Audio fed to recorder")
-        except Exception as e:
-            logger.error(f"Error feeding audio: {e}")
     def live_text_detected(self, text):
         """Callback for real-time transcription updates"""
         with self.transcription_lock:
             self.last_transcription = text.strip()
-        # Update the display immediately on new transcription
-        self.update_conversation_display()
     def process_final_text(self, text):
         """Process final transcribed text with speaker embedding"""
@@ -600,47 +563,112 @@ class RealtimeSpeakerDiarization:
             logger.error(f"Error processing audio chunk: {e}")
-# Create diarization handler for FastRTC
-class DiarizationAudioHandler(StreamHandler):
     def __init__(self, diarization_system):
         super().__init__()
         self.diarization_system = diarization_system
-    def receive(self, frame):
-        """Process incoming audio frame"""
-        if not self.diarization_system.is_running:
-            return
         try:
             # Extract audio data
-            sample_rate, audio_array = frame
-            # Send audio to diarization system for processing
-            self.diarization_system.feed_audio(audio_array)
         except Exception as e:
-            logger.error(f"Error processing FastRTC audio: {e}")
-    def copy(self):
-        """Return a fresh handler instance"""
-        return DiarizationAudioHandler(self.diarization_system)
-    def shutdown(self):
-        """Clean up resources"""
-        pass
-    def start_up(self):
-        """Initialize resources"""
-        logger.info("DiarizationAudioHandler started")
-# Global diarization system instance
 diarization_system = RealtimeSpeakerDiarization()
 def initialize_system():
     """Initialize the diarization system"""
     try:
         success = diarization_system.initialize_models()
         if success:
             return "✅ System initialized successfully!"
         else:
             return "❌ Failed to initialize system. Check logs for details."
@@ -656,6 +684,10 @@ def start_recording():
     except Exception as e:
         return f"❌ Failed to start recording: {str(e)}"
 def stop_recording():
     """Stop recording and transcription"""
     try:
@@ -694,56 +726,238 @@ def get_status():
     except Exception as e:
         return f"Error getting status: {str(e)}"
-# Create handler wrapper function for FastRTC
-def diarization_handler(audio_data):
-    """Handler function for FastRTC stream"""
-    try:
-        # Process the audio data
-        diarization_system.process_audio_chunk(audio_data[1], audio_data[0])
-        # Just yield the original audio back (echo)
-        # This can be changed to just return None since we don't need echo
-        # This can be changed to just return None since we don't need echo
-        yield audio_data
-    except Exception as e:
-        logger.error(f"Error in diarization handler: {e}")
-# Create FastRTC stream with ReplyOnPause pattern
-stream = Stream(
-    handler=ReplyOnPause(diarization_handler),
-    modality="audio",
-    mode="send-receive",
-    rtc_configuration=get_cloudflare_turn_credentials_async,
-    server_rtc_configuration=get_cloudflare_turn_credentials(ttl=360_000),
-    ui_args={
-        "title": "Real-time Speaker Diarization",
-        "description": "Live transcription with automatic speaker identification"
-    }
-)
 # Main execution
 if __name__ == "__main__":
     import argparse
-    import os
     parser = argparse.ArgumentParser(description="Real-time Speaker Diarization System")
-    parser.add_argument("--mode", choices=["ui", "api", "both"], default="ui",
-                       help="Run mode: FastRTC UI, API only, or both")
     parser.add_argument("--host", default="0.0.0.0", help="Host to bind to")
-    parser.add_argument("--port", type=int, default=int(os.environ.get("GRADIO_SERVER_PORT", 7860)),
-                      help="Port to bind to")
     parser.add_argument("--api-port", type=int, default=8000, help="API port (when running both)")
     args = parser.parse_args()
-    # Initialize the system before running anything
-    initialize_system()
-    start_recording()
-    if args.mode == "ui":
-        # Launch the FastRTC built-in UI
-        stream.ui.launch(
             server_name=args.host,
             server_port=args.port,
             share=True,
@@ -752,8 +966,6 @@ if __name__ == "__main__":
     elif args.mode == "api":
         # Run FastAPI only
-        app = FastAPI()
-        stream.mount(app)
         uvicorn.run(
             app,
             host=args.host,
@@ -762,12 +974,20 @@ if __name__ == "__main__":
         )
     elif args.mode == "both":
-        # Run both FastRTC UI and API
         import threading
         def run_fastapi():
-            app = FastAPI()
-            stream.mount(app)
             uvicorn.run(
                 app,
                 host=args.host,
@@ -779,10 +999,5 @@ if __name__ == "__main__":
         api_thread = threading.Thread(target=run_fastapi, daemon=True)
         api_thread.start()
-        # Start FastRTC UI in main thread
-        stream.ui.launch(
-            server_name=args.host,
-            server_port=args.port,
-            share=True,
-            show_error=True
-        )

 from scipy.spatial.distance import cosine
 from RealtimeSTT import AudioToTextRecorder
 from fastapi import FastAPI, APIRouter
+from fastrtc import Stream, AsyncStreamHandler, WebRTC
 import json
 import asyncio
 import uvicorn
         except Exception as e:
             logger.error(f"Model initialization error: {e}")
             return False
     def live_text_detected(self, text):
         """Record the latest real-time transcription, trimmed of surrounding whitespace."""
         trimmed = text.strip()
         # The lock guards last_transcription against concurrent access.
         with self.transcription_lock:
             self.last_transcription = trimmed
     def process_final_text(self, text):
         """Process final transcribed text with speaker embedding"""
             logger.error(f"Error processing audio chunk: {e}")
+# FastRTC Audio Handler
+class DiarizationHandler(AsyncStreamHandler):
     def __init__(self, diarization_system):
         super().__init__()
         self.diarization_system = diarization_system
+        self.audio_buffer = []
+        self.buffer_size = BUFFER_SIZE
+    def copy(self):
+        """Return a fresh handler for each new stream connection"""
+        return DiarizationHandler(self.diarization_system)
+    async def emit(self):
+        """Not used - we only receive audio"""
+        return None
+    async def receive(self, frame):
+        """Receive audio data from FastRTC"""
         try:
+            if not self.diarization_system.is_running:
+                return
             # Extract audio data
+            audio_data = getattr(frame, 'data', frame)
+            # Check if this is a tuple (sample_rate, audio_array)
+            if isinstance(audio_data, tuple) and len(audio_data) >= 2:
+                sample_rate, audio_array = audio_data
+            else:
+                # If not a tuple, assume it's raw audio bytes/array
+                sample_rate = SAMPLE_RATE  # Use default sample rate
+                # Convert to numpy array
+                if isinstance(audio_data, bytes):
+                    audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
+                elif isinstance(audio_data, (list, tuple)):
+                    audio_array = np.array(audio_data, dtype=np.float32)
+                else:
+                    audio_array = np.array(audio_data, dtype=np.float32)
+            # Ensure 1D
+            if len(audio_array.shape) > 1:
+                audio_array = audio_array.flatten()
+            # Send audio to recorder for live transcription
+            if self.diarization_system.recorder:
+                try:
+                    self.diarization_system.recorder.feed_audio(audio_array)
+                    logger.info("Fed audio to recorder")
+                except Exception as e:
+                    logger.error(f"Error feeding audio to recorder: {e}")
+            # Buffer audio chunks
+            self.audio_buffer.extend(audio_array)
+            # Process in chunks
+            while len(self.audio_buffer) >= self.buffer_size:
+                chunk = np.array(self.audio_buffer[:self.buffer_size])
+                self.audio_buffer = self.audio_buffer[self.buffer_size:]
+                # Process asynchronously
+                await self.process_audio_async(chunk)
         except Exception as e:
+            logger.error(f"Error in FastRTC receive: {e}")
+    async def process_audio_async(self, audio_data):
+        """Process audio data asynchronously"""
+        try:
+            loop = asyncio.get_event_loop()
+            await loop.run_in_executor(
+                None,
+                self.diarization_system.process_audio_chunk,
+                audio_data,
+                SAMPLE_RATE
+            )
+        except Exception as e:
+            logger.error(f"Error in async audio processing: {e}")
+    async def start_up(self):
+        logger.info("DiarizationHandler started")
+    async def shutdown(self):
+        logger.info("DiarizationHandler shutdown")
+# Global instances
+# Single shared diarization system used by the handlers, the UI callbacks and the API routes.
 diarization_system = RealtimeSpeakerDiarization()
+# The module-level `stream` is rebound inside initialize_system() once the
+# models have loaded successfully.
 def initialize_system():
     """Initialize the diarization system"""
+    global stream
     try:
         success = diarization_system.initialize_models()
         if success:
+            # Create a DiarizationHandler linked to our system
+            handler = DiarizationHandler(diarization_system)
+            # Update the Stream's handler
+            stream = Stream(
+                handler=handler,
+                modality="audio",
+                mode="send-receive",
+                stream_name="audio_stream"  # Match the stream_name in WebRTC component
+            )
             return "✅ System initialized successfully!"
         else:
             return "❌ Failed to initialize system. Check logs for details."
     except Exception as e:
         return f"❌ Failed to start recording: {str(e)}"
+def on_start():
+    result = start_recording()
+    return result, gr.update(interactive=False), gr.update(interactive=True)
 def stop_recording():
     """Stop recording and transcription"""
     try:
     except Exception as e:
         return f"Error getting status: {str(e)}"
+# Create Gradio interface
+def create_interface():
+    with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Soft()) as interface:
+        gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
+        gr.Markdown("Live transcription with automatic speaker identification using FastRTC audio streaming.")
+        with gr.Row():
+            with gr.Column(scale=2):
+                # Replace standard Audio with WebRTC component
+                audio_component = WebRTC(
+                    label="Audio Input",
+                    stream_name="audio_stream",
+                    modality="audio",
+                    mode="send-receive"
+                )
+                # Conversation display
+                conversation_output = gr.HTML(
+                    value="<div style='padding: 20px; background: #f8f9fa; border-radius: 10px; min-height: 300px;'><i>Click 'Initialize System' to start...</i></div>",
+                    label="Live Conversation"
+                )
+                # Control buttons
+                with gr.Row():
+                    init_btn = gr.Button("🔧 Initialize System", variant="secondary", size="lg")
+                    start_btn = gr.Button("🎙️ Start", variant="primary", size="lg", interactive=False)
+                    stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", interactive=False)
+                    clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="lg", interactive=False)
+                # Status display
+                status_output = gr.Textbox(
+                    label="System Status",
+                    value="Ready to initialize...",
+                    lines=8,
+                    interactive=False
+                )
+            with gr.Column(scale=1):
+                # Settings
+                gr.Markdown("## ⚙️ Settings")
+                threshold_slider = gr.Slider(
+                    minimum=0.3,
+                    maximum=0.9,
+                    step=0.05,
+                    value=DEFAULT_CHANGE_THRESHOLD,
+                    label="Speaker Change Sensitivity",
+                    info="Lower = more sensitive"
+                )
+                max_speakers_slider = gr.Slider(
+                    minimum=2,
+                    maximum=ABSOLUTE_MAX_SPEAKERS,
+                    step=1,
+                    value=DEFAULT_MAX_SPEAKERS,
+                    label="Maximum Speakers"
+                )
+                update_btn = gr.Button("Update Settings", variant="secondary")
+                # Instructions
+                gr.Markdown("""
+                ## 📋 Instructions
+                1. **Initialize** the system (loads AI models)
+                2. **Start** recording
+                3. **Speak** - system will transcribe and identify speakers
+                4. **Monitor** real-time results below
+                ## 🎨 Speaker Colors
+                - 🔴 Speaker 1 (Red)
+                - 🟢 Speaker 2 (Teal)
+                - 🔵 Speaker 3 (Blue)
+                - 🟡 Speaker 4 (Green)
+                - 🟣 Speaker 5 (Yellow)
+                - 🟤 Speaker 6 (Plum)
+                - 🟫 Speaker 7 (Mint)
+                - 🟨 Speaker 8 (Gold)
+                """)
+        # Event handlers
+        def on_initialize():
+            result = initialize_system()
+            if "✅" in result:
+                return result, gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)
+            else:
+                return result, gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
+        def on_start():
+            result = start_recording()
+            return result, gr.update(interactive=False), gr.update(interactive=True)
+        def on_stop():
+            result = stop_recording()
+            return result, gr.update(interactive=True), gr.update(interactive=False)
+        def on_clear():
+            result = clear_conversation()
+            return result
+        def on_update_settings(threshold, max_speakers):
+            result = update_settings(threshold, int(max_speakers))
+            return result
+        def refresh_conversation():
+            return get_conversation()
+        def refresh_status():
+            return get_status()
+        # Button click handlers
+        init_btn.click(
+            fn=on_initialize,
+            outputs=[status_output, start_btn, stop_btn, clear_btn]
+        )
+        start_btn.click(
+            fn=on_start,
+            outputs=[status_output, start_btn, stop_btn]
+        )
+        stop_btn.click(
+            fn=on_stop,
+            outputs=[status_output, start_btn, stop_btn]
+        )
+        clear_btn.click(
+            fn=on_clear,
+            outputs=[status_output]
+        )
+        update_btn.click(
+            fn=on_update_settings,
+            inputs=[threshold_slider, max_speakers_slider],
+            outputs=[status_output]
+        )
+        # Auto-refresh conversation display every 1 second
+        conversation_timer = gr.Timer(1)
+        conversation_timer.tick(refresh_conversation, outputs=[conversation_output])
+        # Auto-refresh status every 2 seconds
+        status_timer = gr.Timer(2)
+        status_timer.tick(refresh_status, outputs=[status_output])
+    return interface
+# FastAPI setup for FastRTC integration: this app serves the REST routes
+# declared below and is the app the FastRTC stream is mounted on.
+app = FastAPI()
+# Create a placeholder handler - will be properly initialized later
+class DefaultHandler(AsyncStreamHandler):
+    def __init__(self):
+        super().__init__()
+    async def receive(self, frame):
+        pass
+    async def emit(self):
+        return None
+    def copy(self):
+        return DefaultHandler()
+    async def shutdown(self):
+        pass
+    async def start_up(self):
+        pass
+# Initialize with placeholder handler
+# NOTE(review): initialize_system() rebinds the global `stream` to a new
+# Stream, but only the instance created here is mounted on `app` in the code
+# shown; confirm the replacement gets mounted too, otherwise API clients keep
+# talking to the no-op DefaultHandler.
+stream = Stream(handler=DefaultHandler(), modality="audio", mode="send-receive")
+stream.mount(app)
+@app.get("/")
+async def root():
+    return {"message": "Real-time Speaker Diarization API"}
+@app.get("/health")
+async def health_check():
+    return {"status": "healthy", "system_running": diarization_system.is_running}
+@app.post("/initialize")
+async def api_initialize():
+    result = initialize_system()
+    return {"result": result, "success": "✅" in result}
+@app.post("/start")
+async def api_start():
+    result = start_recording()
+    return {"result": result, "success": "🎙️" in result}
+@app.post("/stop")
+async def api_stop():
+    result = stop_recording()
+    return {"result": result, "success": "⏹️" in result}
+@app.post("/clear")
+async def api_clear():
+    result = clear_conversation()
+    return {"result": result}
+@app.get("/conversation")
+async def api_get_conversation():
+    return {"conversation": get_conversation()}
+@app.get("/status")
+async def api_get_status():
+    return {"status": get_status()}
+@app.post("/settings")
+async def api_update_settings(threshold: float, max_speakers: int):
+    result = update_settings(threshold, max_speakers)
+    return {"result": result}
 # Main execution
 if __name__ == "__main__":
     import argparse
     parser = argparse.ArgumentParser(description="Real-time Speaker Diarization System")
+    parser.add_argument("--mode", choices=["gradio", "api", "both"], default="gradio",
+                       help="Run mode: gradio interface, API only, or both")
     parser.add_argument("--host", default="0.0.0.0", help="Host to bind to")
+    parser.add_argument("--port", type=int, default=7860, help="Port to bind to")
     parser.add_argument("--api-port", type=int, default=8000, help="API port (when running both)")
     args = parser.parse_args()
+    if args.mode == "gradio":
+        # Run Gradio interface only
+        interface = create_interface()
+        interface.launch(
             server_name=args.host,
             server_port=args.port,
             share=True,
     elif args.mode == "api":
         # Run FastAPI only
         uvicorn.run(
             app,
             host=args.host,
         )
     elif args.mode == "both":
+        # Run both Gradio and FastAPI
+        import multiprocessing
         import threading
+        def run_gradio():
+            interface = create_interface()
+            interface.launch(
+                server_name=args.host,
+                server_port=args.port,
+                share=True,
+                show_error=True
+            )
         def run_fastapi():
             uvicorn.run(
                 app,
                 host=args.host,
         api_thread = threading.Thread(target=run_fastapi, daemon=True)
         api_thread.start()
+        # Start Gradio in main thread
+        run_gradio()