Saiyaswanth007 committed on
Commit b9dea2c · 1 Parent(s): d65b6e8

Fix Gradio RealStream: replace the FastRTC microphone streaming pipeline with an upload-based audio + transcription diarization flow.

Files changed (1):
  1. app.py +241 -554

app.py CHANGED
@@ -1,24 +1,18 @@
 import gradio as gr
 import numpy as np
-import queue
 import torch
+import torchaudio
 import time
-import threading
 import os
 import urllib.request
-import torchaudio
 from scipy.spatial.distance import cosine
-import json
+import threading
+import queue
+from collections import deque
 import asyncio
-from typing import Iterator
-import logging
+from typing import Generator, Tuple, List, Optional
 
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-# Simplified configuration parameters
-SILENCE_THRESHS = [0, 0.4]
+# Configuration parameters (keeping original models)
 FINAL_TRANSCRIPTION_MODEL = "distil-large-v3"
 FINAL_BEAM_SIZE = 5
 REALTIME_TRANSCRIPTION_MODEL = "distil-small.en"
@@ -35,33 +29,10 @@ EMBEDDING_HISTORY_SIZE = 5
 MIN_SEGMENT_DURATION = 1.0
 DEFAULT_MAX_SPEAKERS = 4
 ABSOLUTE_MAX_SPEAKERS = 10
-
-# Global variables
-FAST_SENTENCE_END = True
 SAMPLE_RATE = 16000
-BUFFER_SIZE = 1024
-CHANNELS = 1
-CHUNK_DURATION_MS = 100  # 100ms chunks for FastRTC
-
-# Speaker colors
-SPEAKER_COLORS = [
-    "#FFFF00",  # Yellow
-    "#FF0000",  # Red
-    "#00FF00",  # Green
-    "#00FFFF",  # Cyan
-    "#FF00FF",  # Magenta
-    "#0000FF",  # Blue
-    "#FF8000",  # Orange
-    "#00FF80",  # Spring Green
-    "#8000FF",  # Purple
-    "#FFFFFF",  # White
-]
-
-SPEAKER_COLOR_NAMES = [
-    "Yellow", "Red", "Green", "Cyan", "Magenta",
-    "Blue", "Orange", "Spring Green", "Purple", "White"
-]
 
+# Speaker labels
+SPEAKER_LABELS = [f"Speaker {i+1}" for i in range(ABSOLUTE_MAX_SPEAKERS)]
 
 class SpeechBrainEncoder:
     """ECAPA-TDNN encoder from SpeechBrain for speaker embeddings"""
@@ -73,24 +44,11 @@ class SpeechBrainEncoder:
         self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "speechbrain")
         os.makedirs(self.cache_dir, exist_ok=True)
 
-    def _download_model(self):
-        """Download pre-trained SpeechBrain ECAPA-TDNN model if not present"""
-        model_url = "https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb/resolve/main/embedding_model.ckpt"
-        model_path = os.path.join(self.cache_dir, "embedding_model.ckpt")
-
-        if not os.path.exists(model_path):
-            logger.info(f"Downloading ECAPA-TDNN model to {model_path}...")
-            urllib.request.urlretrieve(model_url, model_path)
-
-        return model_path
-
     def load_model(self):
         """Load the ECAPA-TDNN model"""
         try:
             from speechbrain.pretrained import EncoderClassifier
 
-            model_path = self._download_model()
-
             self.model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-ecapa-voxceleb",
                 savedir=self.cache_dir,
@@ -100,7 +58,7 @@ class SpeechBrainEncoder:
             self.model_loaded = True
             return True
         except Exception as e:
-            logger.error(f"Error loading ECAPA-TDNN model: {e}")
+            print(f"Error loading ECAPA-TDNN model: {e}")
             return False
 
     def embed_utterance(self, audio, sr=16000):
@@ -122,31 +80,12 @@ class SpeechBrainEncoder:
 
             return embedding.squeeze().cpu().numpy()
         except Exception as e:
-            logger.error(f"Error extracting embedding: {e}")
+            print(f"Error extracting embedding: {e}")
             return np.zeros(self.embedding_dim)
 
 
-class AudioProcessor:
-    """Processes audio data to extract speaker embeddings"""
-    def __init__(self, encoder):
-        self.encoder = encoder
-
-    def extract_embedding(self, audio_float):
-        try:
-            # Ensure audio is in the right format
-            if np.abs(audio_float).max() > 1.0:
-                audio_float = audio_float / np.abs(audio_float).max()
-
-            embedding = self.encoder.embed_utterance(audio_float)
-
-            return embedding
-        except Exception as e:
-            logger.error(f"Embedding extraction error: {e}")
-            return np.zeros(self.encoder.embedding_dim)
-
-
 class SpeakerChangeDetector:
-    """Speaker change detector that supports a configurable number of speakers"""
+    """Speaker change detector that supports configurable number of speakers"""
     def __init__(self, embedding_dim=192, change_threshold=DEFAULT_CHANGE_THRESHOLD, max_speakers=DEFAULT_MAX_SPEAKERS):
         self.embedding_dim = embedding_dim
         self.change_threshold = change_threshold
@@ -254,569 +193,317 @@ class SpeakerChangeDetector:
         )
 
         return self.current_speaker, similarity
-
-    def get_color_for_speaker(self, speaker_id):
-        """Return color for speaker ID"""
-        if 0 <= speaker_id < len(SPEAKER_COLORS):
-            return SPEAKER_COLORS[speaker_id]
-        return "#FFFFFF"
-
-    def get_status_info(self):
-        """Return status information about the speaker change detector"""
-        speaker_counts = [len(self.speaker_embeddings[i]) for i in range(self.max_speakers)]
-
-        return {
-            "current_speaker": self.current_speaker,
-            "speaker_counts": speaker_counts,
-            "active_speakers": len(self.active_speakers),
-            "max_speakers": self.max_speakers,
-            "last_similarity": self.last_similarity,
-            "threshold": self.change_threshold
-        }
 
 
-class WhisperTranscriber:
-    """Whisper transcriber using transformers with FastRTC optimization"""
-    def __init__(self, model_name="distil-large-v3"):
-        self.model = None
-        self.processor = None
-        self.model_name = model_name
-        self.model_loaded = False
-
-    def load_model(self):
-        """Load Whisper model"""
-        try:
-            from transformers import WhisperProcessor, WhisperForConditionalGeneration
-
-            model_id = f"distil-whisper/distil-{self.model_name}" if "distil" in self.model_name else f"openai/whisper-{self.model_name}"
-
-            self.processor = WhisperProcessor.from_pretrained(model_id)
-            self.model = WhisperForConditionalGeneration.from_pretrained(
-                model_id,
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                low_cpu_mem_usage=True,
-                use_safetensors=True
-            )
-
-            if torch.cuda.is_available():
-                self.model = self.model.cuda()
-
-            self.model_loaded = True
-            return True
-        except Exception as e:
-            logger.error(f"Error loading Whisper model: {e}")
-            return False
+class AudioProcessor:
+    """Processes audio data to extract speaker embeddings"""
+    def __init__(self, encoder):
+        self.encoder = encoder
 
-    def transcribe(self, audio_array, sample_rate=16000):
-        """Transcribe audio array"""
-        if not self.model_loaded:
-            return ""
-
+    def extract_embedding(self, audio_data):
         try:
-            # Ensure audio is the right length and format
-            if len(audio_array) < 1600:  # Less than 0.1 seconds
-                return ""
-
-            # Resample if needed
-            if sample_rate != 16000:
-                import torchaudio.functional as F
-                audio_tensor = torch.tensor(audio_array, dtype=torch.float32)
-                audio_array = F.resample(audio_tensor, sample_rate, 16000).numpy()
-
-            # Process with Whisper
-            inputs = self.processor(
-                audio_array,
-                sampling_rate=16000,
-                return_tensors="pt",
-                truncation=False,
-                padding=True
-            )
+            # Ensure audio is float32 and normalized
+            if audio_data.dtype != np.float32:
+                audio_data = audio_data.astype(np.float32)
 
-            if torch.cuda.is_available():
-                inputs = {k: v.cuda() for k, v in inputs.items()}
+            # Normalize if needed
+            if np.abs(audio_data).max() > 1.0:
+                audio_data = audio_data / np.abs(audio_data).max()
 
-            with torch.no_grad():
-                predicted_ids = self.model.generate(
-                    inputs["input_features"],
-                    max_length=448,
-                    num_beams=1,
-                    do_sample=False,
-                    use_cache=True
-                )
-
-                transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+            # Extract embedding using the loaded encoder
+            embedding = self.encoder.embed_utterance(audio_data)
 
-            return transcription.strip()
+            return embedding
         except Exception as e:
-            logger.error(f"Transcription error: {e}")
-            return ""
+            print(f"Embedding extraction error: {e}")
+            return np.zeros(self.encoder.embedding_dim)
 
 
-class FastRTCSpeakerDiarization:
-    def __init__(self):
+class RealTimeSpeakerDiarization:
+    """Main class for real-time speaker diarization"""
+    def __init__(self, change_threshold=DEFAULT_CHANGE_THRESHOLD, max_speakers=DEFAULT_MAX_SPEAKERS):
         self.encoder = None
         self.audio_processor = None
         self.speaker_detector = None
-        self.transcriber = None
-        self.audio_queue = queue.Queue(maxsize=100)
+        self.change_threshold = change_threshold
+        self.max_speakers = max_speakers
+        self.transcript_history = []
+        self.is_initialized = False
+
+        # Threading components
+        self.audio_queue = queue.Queue()
         self.processing_thread = None
-        self.full_sentences = []
-        self.sentence_speakers = []
-        self.is_running = False
-        self.change_threshold = DEFAULT_CHANGE_THRESHOLD
-        self.max_speakers = DEFAULT_MAX_SPEAKERS
-        self.audio_buffer = []
-        self.buffer_duration = 3.0  # seconds
-        self.last_transcription_time = time.time()
-        self.chunk_size = int(SAMPLE_RATE * CHUNK_DURATION_MS / 1000)
-
-    def initialize_models(self):
-        """Initialize the speaker encoder and transcription models"""
+        self.running = False
+
+    async def initialize(self):
+        """Initialize the speaker diarization system"""
+        if self.is_initialized:
+            return True
+
         try:
             device_str = "cuda" if torch.cuda.is_available() else "cpu"
-            logger.info(f"Using device: {device_str}")
+            print(f"Initializing ECAPA-TDNN model on {device_str}...")
 
-            # Initialize speaker encoder
             self.encoder = SpeechBrainEncoder(device=device_str)
-            encoder_success = self.encoder.load_model()
-
-            # Initialize transcriber
-            self.transcriber = WhisperTranscriber(FINAL_TRANSCRIPTION_MODEL)
-            transcriber_success = self.transcriber.load_model()
+            success = self.encoder.load_model()
 
-            if encoder_success and transcriber_success:
-                self.audio_processor = AudioProcessor(self.encoder)
-                self.speaker_detector = SpeakerChangeDetector(
-                    embedding_dim=self.encoder.embedding_dim,
-                    change_threshold=self.change_threshold,
-                    max_speakers=self.max_speakers
-                )
-                logger.info("Models loaded successfully!")
-                return True
-            else:
-                logger.error("Failed to load models")
+            if not success:
                 return False
+
+            self.audio_processor = AudioProcessor(self.encoder)
+            self.speaker_detector = SpeakerChangeDetector(
+                embedding_dim=self.encoder.embedding_dim,
+                change_threshold=self.change_threshold,
+                max_speakers=self.max_speakers
+            )
+
+            self.is_initialized = True
+            print("Speaker diarization system initialized successfully!")
+            return True
+
         except Exception as e:
-            logger.error(f"Model initialization error: {e}")
+            print(f"Initialization error: {e}")
             return False
 
-    def process_audio_chunk(self, audio_chunk: np.ndarray, sample_rate: int):
-        """Process individual audio chunk from FastRTC"""
-        if not self.is_running or audio_chunk is None:
-            return
-
-        try:
-            # Ensure audio chunk is in correct format
-            if isinstance(audio_chunk, np.ndarray):
-                # Ensure mono audio
-                if len(audio_chunk.shape) > 1:
-                    audio_chunk = audio_chunk.mean(axis=1)
-
-                # Normalize audio
-                if audio_chunk.dtype != np.float32:
-                    audio_chunk = audio_chunk.astype(np.float32)
-
-                if np.abs(audio_chunk).max() > 1.0:
-                    audio_chunk = audio_chunk / np.abs(audio_chunk).max()
-
-                # Add to buffer
-                self.audio_buffer.extend(audio_chunk)
-
-                # Keep buffer to specified duration
-                max_buffer_length = int(self.buffer_duration * sample_rate)
-                if len(self.audio_buffer) > max_buffer_length:
-                    self.audio_buffer = self.audio_buffer[-max_buffer_length:]
-
-                # Process if enough audio accumulated and enough time passed
-                current_time = time.time()
-                if (current_time - self.last_transcription_time > 1.5 and
-                    len(self.audio_buffer) > sample_rate * 0.8):  # At least 0.8 seconds
-
-                    if not self.audio_queue.full():
-                        self.audio_queue.put((np.array(self.audio_buffer[-int(sample_rate * 2):]), sample_rate))
-                        self.last_transcription_time = current_time
-
-        except Exception as e:
-            logger.error(f"Audio chunk processing error: {e}")
-
-    def process_audio_queue(self):
-        """Process audio from the queue"""
-        while self.is_running:
-            try:
-                audio_data, sample_rate = self.audio_queue.get(timeout=1)
-
-                if len(audio_data) < 1600:  # Skip very short audio
-                    continue
-
-                # Transcribe audio
-                transcription = self.transcriber.transcribe(audio_data, sample_rate)
-
-                if transcription and len(transcription.strip()) > 0:
-                    # Extract speaker embedding
-                    speaker_embedding = self.audio_processor.extract_embedding(audio_data)
-
-                    # Detect speaker
-                    speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
-
-                    # Store results
-                    self.full_sentences.append(transcription.strip())
-                    self.sentence_speakers.append(speaker_id)
-
-                    logger.info(f"Processed: Speaker {speaker_id + 1}: {transcription.strip()[:50]}...")
-
-            except queue.Empty:
-                continue
-            except Exception as e:
-                logger.error(f"Error processing audio queue: {e}")
+    def update_settings(self, change_threshold, max_speakers):
+        """Update diarization settings"""
+        self.change_threshold = change_threshold
+        self.max_speakers = max_speakers
+
+        if self.speaker_detector:
+            self.speaker_detector.set_change_threshold(change_threshold)
+            self.speaker_detector.set_max_speakers(max_speakers)
 
-    def start_recording(self):
-        """Start the recording and processing"""
-        if self.encoder is None or self.transcriber is None:
-            return "Please initialize models first!"
-
+    def process_audio_segment(self, audio_data: np.ndarray, text: str) -> Tuple[int, str]:
+        """Process an audio segment and return speaker ID and formatted text"""
+        if not self.is_initialized:
+            return 0, text
+
         try:
-            self.is_running = True
-            self.audio_buffer = []
-            self.last_transcription_time = time.time()
+            # Extract speaker embedding
+            embedding = self.audio_processor.extract_embedding(audio_data)
 
-            # Clear the queue
-            while not self.audio_queue.empty():
-                try:
-                    self.audio_queue.get_nowait()
-                except queue.Empty:
-                    break
+            # Detect speaker
+            speaker_id, similarity = self.speaker_detector.add_embedding(embedding)
 
-            # Start processing thread
-            self.processing_thread = threading.Thread(target=self.process_audio_queue, daemon=True)
-            self.processing_thread.start()
+            # Format text with speaker label
+            speaker_label = SPEAKER_LABELS[speaker_id]
+            formatted_text = f"{speaker_label}: {text}"
 
-            logger.info("Recording started successfully!")
-            return "Recording started successfully!"
+            return speaker_id, formatted_text
 
         except Exception as e:
-            logger.error(f"Error starting recording: {e}")
-            return f"Error starting recording: {e}"
+            print(f"Error processing audio segment: {e}")
+            return 0, f"Speaker 1: {text}"
 
-    def stop_recording(self):
-        """Stop the recording process"""
-        self.is_running = False
-        logger.info("Recording stopped!")
-        return "Recording stopped!"
-
-    def clear_conversation(self):
-        """Clear all conversation data"""
-        self.full_sentences = []
-        self.sentence_speakers = []
-        self.audio_buffer = []
-
-        # Clear the queue
-        while not self.audio_queue.empty():
-            try:
-                self.audio_queue.get_nowait()
-            except queue.Empty:
-                break
+    def get_transcript_history(self):
+        """Get the formatted transcript history"""
+        return "\n".join(self.transcript_history)
+
+    def add_to_transcript(self, formatted_text: str):
+        """Add formatted text to transcript history"""
+        self.transcript_history.append(formatted_text)
+
+        # Keep only last 50 entries to prevent memory issues
+        if len(self.transcript_history) > 50:
+            self.transcript_history = self.transcript_history[-50:]
+
+    def clear_transcript(self):
+        """Clear transcript history and reset speaker detector"""
+        self.transcript_history = []
 
         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
                 embedding_dim=self.encoder.embedding_dim,
                 change_threshold=self.change_threshold,
                 max_speakers=self.max_speakers
             )
-
-        return "Conversation cleared!"
-
-    def update_settings(self, threshold, max_speakers):
-        """Update speaker detection settings"""
-        self.change_threshold = threshold
-        self.max_speakers = max_speakers
-
-        if self.speaker_detector:
-            self.speaker_detector.set_change_threshold(threshold)
-            self.speaker_detector.set_max_speakers(max_speakers)
-
-        return f"Settings updated: Threshold={threshold:.2f}, Max Speakers={max_speakers}"
-
-    def get_formatted_conversation(self):
-        """Get the formatted conversation with speaker colors"""
-        try:
-            if not self.full_sentences:
-                return "Waiting for speech input... 🎤"
-
-            sentences_with_style = []
-
-            for i, sentence in enumerate(self.full_sentences[-10:]):  # Show last 10 sentences
-                if i >= len(self.sentence_speakers):
-                    color = "#FFFFFF"
-                    speaker_name = "Unknown"
-                else:
-                    speaker_id = self.sentence_speakers[-(10-i) if len(self.sentence_speakers) >= 10 else i]
-                    color = self.speaker_detector.get_color_for_speaker(speaker_id)
-                    speaker_name = f"Speaker {speaker_id + 1}"
-
-                sentences_with_style.append(
-                    f'<p><span style="color:{color}; font-weight: bold;">{speaker_name}:</span> {sentence}</p>')
-
-            return "".join(sentences_with_style)
-
-        except Exception as e:
-            return f"Error formatting conversation: {e}"
-
-    def get_status_info(self):
-        """Get current status information"""
-        if not self.speaker_detector:
-            return "Speaker detector not initialized"
-
-        try:
-            status = self.speaker_detector.get_status_info()
-            queue_size = self.audio_queue.qsize()
-
-            status_lines = [
-                f"**Current Speaker:** {status['current_speaker'] + 1}",
-                f"**Active Speakers:** {status['active_speakers']} of {status['max_speakers']}",
-                f"**Last Similarity:** {status['last_similarity']:.3f}",
-                f"**Change Threshold:** {status['threshold']:.2f}",
-                f"**Total Sentences:** {len(self.full_sentences)}",
-                f"**Buffer Length:** {len(self.audio_buffer)} samples",
-                f"**Queue Size:** {queue_size}",
-                "",
-                "**Speaker Segment Counts:**"
-            ]
-
-            for i in range(status['max_speakers']):
-                color_name = SPEAKER_COLOR_NAMES[i] if i < len(SPEAKER_COLOR_NAMES) else f"Speaker {i+1}"
-                status_lines.append(f"Speaker {i+1} ({color_name}): {status['speaker_counts'][i]}")
-
-            return "\n".join(status_lines)
-
-        except Exception as e:
-            return f"Error getting status: {e}"
 
 
 # Global instance
-diarization_system = FastRTCSpeakerDiarization()
+diarization_system = RealTimeSpeakerDiarization()
 
 
-def initialize_system():
+async def initialize_system():
     """Initialize the diarization system"""
-    success = diarization_system.initialize_models()
+    success = await diarization_system.initialize()
     if success:
-        return "✅ System initialized successfully! Models loaded."
+        return "✅ Speaker diarization system initialized successfully!"
     else:
-        return "❌ Failed to initialize system. Please check the logs."
-
+        return "❌ Failed to initialize speaker diarization system. Please check your setup."
 
-def start_recording():
-    """Start recording and transcription"""
-    return diarization_system.start_recording()
 
-
-def stop_recording():
-    """Stop recording and transcription"""
-    return diarization_system.stop_recording()
+def process_audio_with_transcript(audio_data, sample_rate, transcription_text, change_threshold, max_speakers):
+    """Process audio with transcription for speaker diarization"""
+    if not diarization_system.is_initialized:
+        return "Please initialize the system first.", ""
+
+    if audio_data is None or transcription_text.strip() == "":
+        return diarization_system.get_transcript_history(), ""
+
+    try:
+        # Update settings
+        diarization_system.update_settings(change_threshold, max_speakers)
+
+        # Convert audio to the right format
+        if len(audio_data.shape) > 1:
+            audio_data = audio_data.mean(axis=1)  # Convert to mono
+
+        # Resample if needed
+        if sample_rate != SAMPLE_RATE:
+            audio_data = torchaudio.functional.resample(
+                torch.tensor(audio_data), sample_rate, SAMPLE_RATE
+            ).numpy()
+
+        # Process the audio segment
+        speaker_id, formatted_text = diarization_system.process_audio_segment(audio_data, transcription_text)
+
+        # Add to transcript
+        diarization_system.add_to_transcript(formatted_text)
+
+        # Return updated transcript and current speaker info
+        transcript = diarization_system.get_transcript_history()
+        current_speaker_info = f"Current Speaker: {SPEAKER_LABELS[speaker_id]}"
+
+        return transcript, current_speaker_info
+
+    except Exception as e:
+        error_msg = f"Error processing audio: {str(e)}"
+        return diarization_system.get_transcript_history(), error_msg
 
 
 def clear_conversation():
-    """Clear the conversation"""
-    return diarization_system.clear_conversation()
-
-
-def update_settings(threshold, max_speakers):
-    """Update system settings"""
-    return diarization_system.update_settings(threshold, max_speakers)
-
-
-def get_conversation():
-    """Get the current conversation"""
-    return diarization_system.get_formatted_conversation()
+    """Clear the conversation transcript"""
+    diarization_system.clear_transcript()
+    return "", "Conversation cleared."
 
 
-def get_status():
-    """Get system status"""
-    return diarization_system.get_status_info()
-
-
-def process_audio_stream(audio_stream):
-    """Process streaming audio from FastRTC"""
-    if audio_stream is not None and diarization_system.is_running:
-        sample_rate, audio_data = audio_stream
-        diarization_system.process_audio_chunk(audio_data, sample_rate)
-
-    return get_conversation(), get_status()
-
-
-# Create Gradio interface with FastRTC
-def create_interface():
-    with gr.Blocks(title="FastRTC Real-time Speaker Diarization", theme=gr.themes.Soft()) as app:
-        gr.Markdown("# 🎤 FastRTC Real-time Speech Recognition with Speaker Diarization")
-        gr.Markdown("This app uses Hugging Face FastRTC for real-time audio streaming with automatic speaker identification and color-coding.")
+def create_gradio_interface():
+    """Create and return the Gradio interface"""
+    with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# 🎙️ Real-time Speaker Diarization with ASR")
+        gr.Markdown("Upload audio with transcription to perform real-time speaker diarization.")
 
+        # Initialization section
         with gr.Row():
-            with gr.Column(scale=2):
-                # FastRTC Audio input for real-time streaming
-                audio_input = gr.Audio(
-                    sources=["microphone"],
-                    type="numpy",
-                    streaming=True,
-                    label="🎙️ FastRTC Microphone Input",
-                    format="wav",
-                    show_download_button=False,
-                    container=True,
-                    elem_id="fastrtc_audio"
-                )
-
-                # Main conversation display
-                conversation_output = gr.HTML(
-                    value="<i>Click 'Initialize System' and then 'Start Recording' to begin...</i>",
-                    label="Live Conversation",
-                    elem_id="conversation_display"
-                )
-
-                # Control buttons
-                with gr.Row():
-                    init_btn = gr.Button("🔧 Initialize System", variant="secondary", size="lg")
-                    start_btn = gr.Button("🎙️ Start Recording", variant="primary", interactive=False, size="lg")
-                    stop_btn = gr.Button("⏹️ Stop Recording", variant="stop", interactive=False, size="lg")
-                    clear_btn = gr.Button("🗑️ Clear", interactive=False, size="lg")
-
-                # Status display
-                status_output = gr.Textbox(
-                    label="System Status",
-                    value="System not initialized",
-                    lines=10,
-                    interactive=False,
-                    show_copy_button=True
-                )
-
-            with gr.Column(scale=1):
-                # Settings panel
-                gr.Markdown("## ⚙️ Settings")
-
-                threshold_slider = gr.Slider(
-                    minimum=0.1,
-                    maximum=0.95,
-                    step=0.05,
+            init_btn = gr.Button("🚀 Initialize System", variant="primary")
+            init_status = gr.Textbox(label="Initialization Status", interactive=False)
+
+        # Settings section
+        with gr.Row():
+            with gr.Column():
+                change_threshold = gr.Slider(
+                    minimum=0.1,
+                    maximum=0.9,
                     value=DEFAULT_CHANGE_THRESHOLD,
-                    label="Speaker Change Sensitivity",
-                    info="Lower = more sensitive to changes"
+                    step=0.05,
+                    label="Speaker Change Threshold",
+                    info="Lower values = more sensitive to speaker changes"
                 )
-
-                max_speakers_slider = gr.Slider(
+            with gr.Column():
+                max_speakers = gr.Slider(
                     minimum=2,
                     maximum=ABSOLUTE_MAX_SPEAKERS,
-                    step=1,
                     value=DEFAULT_MAX_SPEAKERS,
-                    label="Maximum Number of Speakers"
+                    step=1,
+                    label="Maximum Number of Speakers",
+                    info="Maximum number of speakers to detect"
                 )
-
-                update_settings_btn = gr.Button("Update Settings", variant="secondary")
-
-                # Speaker color legend
-                gr.Markdown("## 🎨 Speaker Colors")
-                color_info = []
-                for i, (color, name) in enumerate(zip(SPEAKER_COLORS, SPEAKER_COLOR_NAMES)):
-                    color_info.append(f'<span style="color:{color}; font-size: 16px;">●</span> Speaker {i+1} ({name})')
-
-                gr.HTML("<br>".join(color_info[:DEFAULT_MAX_SPEAKERS]))
-
-                # Performance info
-                gr.Markdown("## 📊 Performance")
-                gr.Markdown("""
-                - **FastRTC**: Low-latency audio streaming
-                - **Whisper**: distil-large-v3 for transcription
-                - **ECAPA-TDNN**: Speaker embeddings
-                - **Real-time**: ~100ms processing chunks
-                """)
 
-        # Event handlers
-        def on_initialize():
-            result = initialize_system()
-            if "successfully" in result:
-                return (
-                    result,  # status_output
-                    gr.update(interactive=True),  # start_btn
-                    gr.update(interactive=True),  # clear_btn
-                    get_conversation(),  # conversation_output
-                    get_status()  # status_output update
+        # Audio input and transcription
+        with gr.Row():
+            with gr.Column():
+                audio_input = gr.Audio(
+                    label="Audio Input",
+                    type="numpy",
+                    format="wav"
                 )
-            else:
-                return (
-                    result,  # status_output
-                    gr.update(interactive=False),  # start_btn
-                    gr.update(interactive=False),  # clear_btn
-                    get_conversation(),  # conversation_output
-                    get_status()  # status_output update
+                transcription_input = gr.Textbox(
+                    label="Transcription Text",
+                    placeholder="Enter the transcription of the audio...",
+                    lines=3
                 )
-
-        def on_start():
-            result = start_recording()
-            return (
-                result,  # status_output
-                gr.update(interactive=False),  # start_btn
-                gr.update(interactive=True),  # stop_btn
-            )
-
-        def on_stop():
-            result = stop_recording()
-            return (
-                result,  # status_output
-                gr.update(interactive=True),  # start_btn
-                gr.update(interactive=False),  # stop_btn
-            )
-
-        # Auto-refresh function
-        def refresh_display():
-            return get_conversation(), get_status()
-
-        # Connect event handlers
-        init_btn.click(
-            on_initialize,
-            outputs=[status_output, start_btn, clear_btn, conversation_output, status_output]
+                process_btn = gr.Button("🎯 Process Audio", variant="secondary")
+
+            with gr.Column():
+                current_speaker = gr.Textbox(
+                    label="Current Speaker",
+                    interactive=False
+                )
+                clear_btn = gr.Button("🗑️ Clear Conversation", variant="stop")
+
+        # Output section
+        transcript_output = gr.Textbox(
+            label="Live Transcript with Speaker Labels",
+            lines=15,
+            max_lines=20,
+            interactive=False,
+            placeholder="Processed transcript will appear here..."
         )
 
-        start_btn.click(
-            on_start,
-            outputs=[status_output, start_btn, stop_btn]
+        # Event handlers
+        init_btn.click(
+            fn=initialize_system,
+            outputs=[init_status]
         )
 
-        stop_btn.click(
-            on_stop,
-            outputs=[status_output, start_btn, stop_btn]
+        process_btn.click(
+            fn=process_audio_with_transcript,
+            inputs=[
+                audio_input,
+                gr.Number(value=SAMPLE_RATE, visible=False),  # Hidden sample rate
+                transcription_input,
+                change_threshold,
+                max_speakers
+            ],
+            outputs=[transcript_output, current_speaker]
        )
 
         clear_btn.click(
-            clear_conversation,
-            outputs=[status_output]
+            fn=clear_conversation,
+            outputs=[transcript_output, current_speaker]
         )
 
-        update_settings_btn.click(
-            update_settings,
-            inputs=[threshold_slider, max_speakers_slider],
-            outputs=[status_output]
+        # Auto-process when audio and transcription are provided
+        audio_input.change(
+            fn=process_audio_with_transcript,
+            inputs=[
+                audio_input,
+                gr.Number(value=SAMPLE_RATE, visible=False),
+                transcription_input,
+                change_threshold,
+                max_speakers
+            ],
+            outputs=[transcript_output, current_speaker]
         )
 
-        # FastRTC streaming audio processing
-        audio_input.stream(
-            process_audio_stream,
-            inputs=[audio_input],
-            outputs=[conversation_output, status_output],
-            stream_every=0.1,  # Process every 100ms
-            time_limit=None
-        )
-
-        # Auto-refresh timer
-        refresh_timer = gr.Timer(2.0)
-        refresh_timer.tick(
-            refresh_display,
-            outputs=[conversation_output, status_output]
-        )
+        # Instructions
+        gr.Markdown("""
+        ## Instructions:
+        1. **Initialize**: Click "Initialize System" to load the speaker diarization models
+        2. **Upload Audio**: Upload an audio file (WAV format recommended)
+        3. **Add Transcription**: Enter the transcription text for the audio
+        4. **Adjust Settings**:
+           - **Speaker Change Threshold**: Lower values detect speaker changes more easily
+           - **Max Speakers**: Set the maximum number of speakers you expect
+        5. **Process**: Click "Process Audio" or the system will auto-process
+        6. **View Results**: See the transcript with speaker labels (Speaker 1, Speaker 2, etc.)
+
+        ## Tips:
+        - For similar-sounding speakers, increase the threshold (0.6-0.8)
+        - For different-sounding speakers, lower threshold works better (0.3-0.5)
+        - The system maintains speaker consistency across the conversation
+        - Use "Clear Conversation" to reset the speaker memory
+        """)
 
-    return app
+    return demo
 
 
 if __name__ == "__main__":
-    app = create_interface()
-    app.launch(
+    # Create and launch the Gradio interface
+    demo = create_gradio_interface()
+    demo.launch(
+        share=True,
         server_name="0.0.0.0",
         server_port=7860,
-        share=True
+        show_error=True
     )
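Note on the removed path: Gradio's streaming microphone input delivers (sample_rate, numpy_array) chunks to a callback on a fixed cadence, which is the mechanism the deleted create_interface() relied on. A condensed sketch of that wiring, reusing only the calls that appear in the removed code (gr.Audio(streaming=True) and .stream(..., stream_every=0.1)); the echo callback is a placeholder for illustration, not part of the commit:

    import gradio as gr

    def echo_chunk(chunk):
        # With type="numpy", a streaming gr.Audio emits (sample_rate, samples) tuples
        sr, samples = chunk
        return f"received {len(samples)} samples at {sr} Hz"

    with gr.Blocks() as demo:
        mic = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
        status = gr.Textbox(label="Chunk status")
        # Same wiring the removed code used: invoke the callback every ~100 ms of audio
        mic.stream(echo_chunk, inputs=[mic], outputs=[status], stream_every=0.1)

    demo.launch()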
 
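The new upload-based flow can also be exercised without the UI. A minimal headless sketch, assuming the committed RealTimeSpeakerDiarization API and SAMPLE_RATE constant are importable from app.py; the silent one-second buffer is a stand-in for real speech, so the segment should resolve to Speaker 1:

    import asyncio
    import numpy as np
    from app import RealTimeSpeakerDiarization, SAMPLE_RATE  # assumes this script sits next to app.py

    async def main():
        system = RealTimeSpeakerDiarization(change_threshold=0.65, max_speakers=2)
        if not await system.initialize():  # loads the ECAPA-TDNN encoder
            raise SystemExit("initialization failed")

        segment = np.zeros(SAMPLE_RATE, dtype=np.float32)  # 1 s of placeholder audio
        speaker_id, line = system.process_audio_segment(segment, "hello there")
        system.add_to_transcript(line)
        print(system.get_transcript_history())  # e.g. "Speaker 1: hello there"

    asyncio.run(main())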