Update app.py
app.py
CHANGED
@@ -902,394 +902,11 @@ class ConversationEngine:
 conversation_engine = ConversationEngine()
 speech_recognizer = SpeechRecognizer()

-class ConversationEngine:
-    def __init__(self):
-        self.conversation_history = []
-        self.system_prompt = "You are a helpful assistant that speaks Malayalam fluently. Always respond in Malayalam script with proper formatting."
-        self.saved_voice = None
-        self.saved_voice_text = ""
-        self.tts_cache = {}  # Cache for TTS outputs
-
-        # TTS background processing queue
-        self.tts_queue = queue.Queue()
-        self.tts_thread = threading.Thread(target=self.tts_worker, daemon=True)
-        self.tts_thread.start()
-
-        # Initialize IndicF5 TTS model if available
-        self.tts_model = None
-        self.device = None
-        try:
-            self.initialize_tts_model()
-
-            # Test the model if it was loaded successfully
-            if self.tts_model is not None:
-                print("TTS model initialized successfully")
-        except Exception as e:
-            print(f"Error initializing TTS model: {e}")
-            traceback.print_exc()
-
-    def initialize_tts_model(self):
-        """Initialize the IndicF5 TTS model with optimizations"""
-        try:
-            # Check for HF token in environment and use it if available
-            hf_token = os.getenv("HF_TOKEN")
-            if hf_token:
-                print("Logging into Hugging Face with the provided token.")
-                login(token=hf_token)
-
-            if torch.cuda.is_available():
-                self.device = torch.device("cuda")
-                print(f"Using GPU: {torch.cuda.get_device_name(0)}")
-            else:
-                self.device = torch.device("cpu")
-                print("Using CPU")
-
-            # Enable performance optimizations
-            torch.backends.cudnn.benchmark = True
-
-            # Load TTS model and move it to the appropriate device (GPU/CPU)
-            print("Loading TTS model from ai4bharat/IndicF5...")
-            repo_id = "ai4bharat/IndicF5"
-            self.tts_model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
-            self.tts_model = self.tts_model.to(self.device)
-
-            # Set model to evaluation mode for faster inference
-            self.tts_model.eval()
-            print("TTS model loaded successfully")
-        except Exception as e:
-            print(f"Failed to load TTS model: {e}")
-            self.tts_model = None
-            traceback.print_exc()
-
-    def tts_worker(self):
-        """Background worker to process TTS requests"""
-        while True:
-            try:
-                # Get text and callback from queue
-                text, callback = self.tts_queue.get()
-
-                # Generate speech
-                audio_path = self._generate_tts(text)
-
-                # Execute callback with result
-                if callback:
-                    callback(audio_path)
-
-                # Mark task as done
-                self.tts_queue.task_done()
-            except Exception as e:
-                print(f"Error in TTS worker: {e}")
-                traceback.print_exc()
-
-    def transcribe_audio(self, audio_data, language="ml-IN"):
-        """Convert audio to text using speech recognition"""
-        if audio_data is None:
-            print("No audio data received")
-            return "No audio detected", ""
-
-        # Make sure we have audio data in the expected format
-        try:
-            if isinstance(audio_data, tuple) and len(audio_data) == 2:
-                # Expected format: (sample_rate, audio_samples)
-                sample_rate, audio_samples = audio_data
-            else:
-                print(f"Unexpected audio format: {type(audio_data)}")
-                return "Invalid audio format", ""
-
-            if len(audio_samples) == 0:
-                print("Empty audio samples")
-                return "No speech detected", ""
-
-            # Save the audio temporarily
-            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-            temp_file.close()
-
-            # Save the audio data to the temporary file
-            sf.write(temp_file.name, audio_samples, sample_rate)
-
-            # Use speech recognition on the file
-            recognizer = sr.Recognizer()
-            with sr.AudioFile(temp_file.name) as source:
-                audio = recognizer.record(source)
-
-            text = recognizer.recognize_google(audio, language=language)
-            print(f"Recognized: {text}")
-            return text, text
-
-        except sr.UnknownValueError:
-            print("Speech recognition could not understand audio")
-            return "Could not understand audio", ""
-        except sr.RequestError as e:
-            print(f"Could not request results from Google Speech Recognition service: {e}")
-            return f"Speech recognition service error: {str(e)}", ""
-        except Exception as e:
-            print(f"Error processing audio: {e}")
-            traceback.print_exc()
-            return f"Error processing audio: {str(e)}", ""
-        finally:
-            # Clean up temporary file
-            if 'temp_file' in locals() and os.path.exists(temp_file.name):
-                try:
-                    os.unlink(temp_file.name)
-                except Exception as e:
-                    print(f"Error deleting temporary file: {e}")
-
-    def save_reference_voice(self, audio_data, reference_text):
-        """Save the reference voice for future TTS generation"""
-        if audio_data is None or not reference_text.strip():
-            return "Error: Both reference audio and text are required"
-
-        self.saved_voice = audio_data
-        self.saved_voice_text = reference_text.strip()
-
-        # Clear TTS cache when voice changes
-        self.tts_cache.clear()
-
-        # Debug info
-        sample_rate, audio_samples = audio_data
-        print(f"Saved reference voice: {len(audio_samples)} samples at {sample_rate}Hz")
-        print(f"Reference text: {reference_text}")
-
-        return f"Voice saved successfully! Reference text: {reference_text}"
-
-    def process_text_input(self, text):
-        """Process text input from user"""
-        if text and text.strip():
-            return text, text
-        return "No input provided", ""
-
-    def generate_response(self, input_text):
-        """Generate AI response using GPT-3.5 Turbo"""
-        if not input_text or not input_text.strip():
-            return "ഇൻപുട്ട് ലഭിച്ചില്ല. വീണ്ടും ശ്രമിക്കുക.", None  # "No input received. Please try again."
-
-        try:
-            # Prepare conversation context from history
-            messages = [{"role": "system", "content": self.system_prompt}]
-
-            # Add previous conversations for context
-            for entry in self.conversation_history:
-                role = "user" if entry["role"] == "user" else "assistant"
-                messages.append({"role": role, "content": entry["content"]})
-
-            # Add current input
-            messages.append({"role": "user", "content": input_text})
-
-            # Call OpenAI API
-            response = openai.ChatCompletion.create(
-                model="gpt-3.5-turbo",
-                messages=messages,
-                max_tokens=500,
-                temperature=0.7
-            )
-
-            response_text = response.choices[0].message.content.strip()
-            return response_text, None
-
-        except Exception as e:
-            error_msg = f"എറർ: GPT മോഡലിൽ നിന്ന് ഉത്തരം ലഭിക്കുന്നതിൽ പ്രശ്നമുണ്ടായി: {str(e)}"
-            print(f"Error in GPT response: {e}")
-            traceback.print_exc()
-            return error_msg, None
-
-    def resample_audio(self, audio, orig_sr, target_sr):
-        """Resample audio to match target sample rate only if necessary"""
-        if orig_sr != target_sr:
-            print(f"Resampling audio from {orig_sr}Hz to {target_sr}Hz")
-            return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
-        return audio
-
-    def _generate_tts(self, text):
-        """Internal method to generate TTS without threading"""
-        if not text or not text.strip():
-            print("No text provided for TTS generation")
-            return None
-
-        # Check cache first
-        if text in self.tts_cache:
-            print("Using cached TTS output")
-            return self.tts_cache[text]
-
-        try:
-            # Check if we have a saved voice and the TTS model
-            if self.saved_voice is not None and self.tts_model is not None:
-                sample_rate, audio_data = self.saved_voice
-
-                # Create a temporary file for the reference audio
-                ref_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-                ref_temp_file.close()
-                print(f"Saving reference audio to {ref_temp_file.name}")
-
-                # Save the reference audio data
-                sf.write(ref_temp_file.name, audio_data, sample_rate)
-
-                # Create a temporary file for the output audio
-                output_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-                output_temp_file.close()
-
-                try:
-                    # Generate speech using IndicF5 - simplified approach from second file
-                    print(f"Generating speech with IndicF5. Text: {text[:30]}...")
-                    start_time = time.time()
-
-                    # Use torch.no_grad() to save memory and computation
-                    with torch.no_grad():
-                        # Run the inference - directly use the model as in the second file
-                        synth_audio = self.tts_model(
-                            text,
-                            ref_audio_path=ref_temp_file.name,
-                            ref_text=self.saved_voice_text
-                        )
-
-                    end_time = time.time()
-                    print(f"Speech generation completed in {(end_time - start_time)} seconds")
-
-                    # Normalize output if needed
-                    if synth_audio.dtype == np.int16:
-                        synth_audio = synth_audio.astype(np.float32) / 32768.0
-
-                    # Resample the generated audio to match the reference audio's sample rate
-                    synth_audio = self.resample_audio(synth_audio, orig_sr=24000, target_sr=sample_rate)
-
-                    # Save the synthesized audio
-                    print(f"Saving synthesized audio to {output_temp_file.name}")
-                    sf.write(output_temp_file.name, synth_audio, sample_rate)
-
-                    # Cache the result
-                    self.tts_cache[text] = output_temp_file.name
-
-                    print(f"TTS generation successful, output file: {output_temp_file.name}")
-                    return output_temp_file.name
-                except Exception as e:
-                    print(f"IndicF5 TTS failed with error: {e}")
-                    traceback.print_exc()
-                    # Fall back to Google TTS
-                    return self.fallback_tts(text, output_temp_file.name)
-                finally:
-                    # Clean up reference audio file
-                    if os.path.exists(ref_temp_file.name):
-                        try:
-                            os.unlink(ref_temp_file.name)
-                        except Exception as e:
-                            print(f"Error deleting temporary file: {e}")
-            else:
-                if self.saved_voice is None:
-                    print("No saved voice available for TTS")
-                if self.tts_model is None:
-                    print("TTS model not initialized")
-
-                # No saved voice or TTS model, use fallback
-                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-                temp_file.close()
-                return self.fallback_tts(text, temp_file.name)
-
-        except Exception as e:
-            print(f"Error in TTS processing: {e}")
-            traceback.print_exc()
-            return None
-
-    def speak_with_indicf5(self, text, callback=None):
-        """Queue text for TTS generation"""
-        if not text or not text.strip():
-            if callback:
-                callback(None)
-            return None
-
-        # Check cache first for immediate response
-        if text in self.tts_cache:
-            print("Using cached TTS output")
-            if callback:
-                callback(self.tts_cache[text])
-            return self.tts_cache[text]
-
-        # If no callback provided, generate synchronously
-        if callback is None:
-            return self._generate_tts(text)
-
-        # Otherwise, queue for async processing
-        self.tts_queue.put((text, callback))
-        return None
-
-    def fallback_tts(self, text, output_path):
-        """Fallback to Google TTS if IndicF5 fails"""
-        try:
-            from gtts import gTTS
-
-            # Determine if text is Malayalam
-            is_malayalam = any('\u0D00' <= c <= '\u0D7F' for c in text)
-            lang = 'ml' if is_malayalam else 'en'
-
-            print(f"Using fallback Google TTS with language: {lang}")
-            tts = gTTS(text=text, lang=lang, slow=False)
-            tts.save(output_path)
-
-            # Cache the result
-            self.tts_cache[text] = output_path
-            print(f"Fallback TTS saved to: {output_path}")
-
-            return output_path
-        except Exception as e:
-            print(f"Fallback TTS also failed: {e}")
-            traceback.print_exc()
-            return None
-
-    def add_message(self, role, content):
-        """Add a message to the conversation history"""
-        timestamp = datetime.now().strftime("%H:%M:%S")
-        self.conversation_history.append({
-            "role": role,
-            "content": content,
-            "timestamp": timestamp
-        })
-
-    def clear_conversation(self):
-        """Clear the conversation history"""
-        self.conversation_history = []
-
-    def cleanup(self):
-        """Clean up resources when shutting down"""
-        print("Cleaning up resources...")
-
-# Load example Malayalam voices
-def load_audio_from_url(url):
-    """Load audio from a URL"""
-    try:
-        response = requests.get(url)
-        if response.status_code == 200:
-            audio_data, sample_rate = sf.read(io.BytesIO(response.content))
-            return sample_rate, audio_data
-    except Exception as e:
-        print(f"Error loading audio from URL: {e}")
-    return None, None
-
-# Malayalam voice examples
-EXAMPLE_VOICES = [
-    {
-        "name": "Aparna Voice",
-        "url": "https://raw.githubusercontent.com/Aparna0112/voicerecording-_TTS/main/Aparna%20Voice.wav",
-        "transcript": "ഞാൻ ഒരു ഫോണിന്റെ കവർ നോക്കുകയാണ്. എനിക്ക് സ്മാർട്ട് ഫോണിന് കവർ വേണം"
-    },
-    {
-        "name": "KC Voice",
-        "url": "https://raw.githubusercontent.com/Aparna0112/voicerecording-_TTS/main/KC%20Voice.wav",
-        "transcript": "ഹലോ ഇത് അപരനെ അല്ലേ ഞാൻ ജഗദീപ് ആണ് വിളിക്കുന്നത് ഇപ്പോൾ ഫ്രീയാണോ സംസാരിക്കാമോ"
-    }
-]
-
-# Preload example voices
-for voice in EXAMPLE_VOICES:
-    sample_rate, audio_data = load_audio_from_url(voice["url"])
-    if sample_rate is not None:
-        voice["audio"] = (sample_rate, audio_data)
-        print(f"Loaded example voice: {voice['name']}")
-    else:
-        print(f"Failed to load voice: {voice['name']}")
-
 def create_chatbot_interface():
     """Create a single-page chatbot interface with voice input, output, and voice selection"""

-    #
-
+    # Use global conversation engine
+    global conversation_engine, speech_recognizer

     # CSS for styling the chat interface
     css = """
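Note on the removed block: the ConversationEngine deleted here duplicated the class already defined earlier in app.py (the instance created on line 902 survives). Its TTS pipeline hands synthesis jobs to a daemon thread through a queue.Queue and returns results via callbacks. A minimal, self-contained sketch of that worker pattern; the names here are illustrative, not from app.py:

    import queue
    import threading

    def start_tts_worker(generate):
        """Run TTS jobs on a background daemon thread, one at a time."""
        jobs = queue.Queue()

        def worker():
            while True:
                text, callback = jobs.get()   # blocks until a job is queued
                try:
                    callback(generate(text))  # deliver the audio path to the caller
                finally:
                    jobs.task_done()          # mark the job finished even on error

        threading.Thread(target=worker, daemon=True).start()
        return jobs

    # Usage: enqueue text, receive the result asynchronously.
    jobs = start_tts_worker(lambda text: f"/tmp/{abs(hash(text))}.wav")
    jobs.put(("നമസ്കാരം", print))  # "Hello" in Malayalam
    jobs.join()                    # wait until the queue drains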
@@ -1297,7 +914,7 @@ def create_chatbot_interface():
     display: flex;
     flex-direction: column;
     height: 100%;
-    max-width:
+    max-width: 1000px;
     margin: 0 auto;
 }
 .chat-window {
@@ -1307,7 +924,7 @@ def create_chatbot_interface():
     background: #f5f7f9;
     border-radius: 0.5rem;
     margin-bottom: 1rem;
-    min-height:
+    min-height: 450px;
 }
 .input-area {
     display: flex;
@@ -1316,11 +933,12 @@ def create_chatbot_interface():
     align-items: center;
 }
 .message {
-    margin-bottom:
-    padding: 0.
+    margin-bottom: 0.8rem;
+    padding: 0.7rem;
     border-radius: 0.5rem;
     position: relative;
     max-width: 80%;
+    font-size: 0.95rem;
 }
 .user-message {
     background: #e1f5fe;
@@ -1341,10 +959,11 @@ def create_chatbot_interface():
     text-align: center;
     color: #333;
     margin-bottom: 1rem;
+    font-size: 1.8rem;
 }
 .chat-controls {
     display: flex;
-    justify-content:
+    justify-content: flex-end;
     margin-bottom: 0.5rem;
 }
 .voice-selector {
@@ -1353,46 +972,64 @@ def create_chatbot_interface():
     border-radius: 0.5rem;
     margin-bottom: 1rem;
 }
-.
-
-
-
-
-
+button.primary {
+    background-color: #4f46e5;
+    color: white;
+    padding: 0.6rem 1.2rem;
+    border-radius: 0.375rem;
+    font-weight: 500;
 }
-.
-
-
-
-
+button.secondary {
+    background-color: #e5e7eb;
+    color: #374151;
+    padding: 0.6rem 1.2rem;
+    border-radius: 0.375rem;
+    font-weight: 500;
+}
+.audio-player {
+    margin-top: 0.5rem;
+    margin-bottom: 1rem;
+}
+/* Customizing Gradio's default elements */
+.gradio-container {
+    max-width: 1000px !important;
+}
+.message-bubble {
+    font-size: 0.95rem !important;
+}
+.message-wrap {
+    margin-bottom: 8px !important;
 }
 """

     with gr.Blocks(css=css, title="Malayalam Voice Chatbot") as interface:
-        gr.Markdown("# 🤖 Malayalam Voice Chatbot
+        gr.Markdown("# 🤖 Malayalam Voice Chatbot", elem_classes=["chatbot-header"])

-        # Create a state variable for TTS progress
+        # Create a state variable for TTS progress (hidden but needed for functionality)
         tts_progress_state = gr.State(0)
         audio_output_state = gr.State(None)

         with gr.Row(elem_classes=["chatbot-container"]):
             with gr.Column():
-                # Voice selection section
+                # Voice selection section
                 with gr.Accordion("🎤 Voice Selection", open=True):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    with gr.Row():
+                        # Select from example voices
+                        with gr.Column(scale=1):
+                            voice_selector = gr.Dropdown(
+                                choices=[voice["name"] for voice in EXAMPLE_VOICES],
+                                value=EXAMPLE_VOICES[0]["name"] if EXAMPLE_VOICES else None,
+                                label="Select Example Voice"
+                            )
+
+                        # Display selected voice info
+                        with gr.Column(scale=2):
+                            voice_info = gr.Textbox(
+                                value=EXAMPLE_VOICES[0]["transcript"] if EXAMPLE_VOICES else "",
+                                label="Voice Sample Transcript",
+                                lines=2,
+                                interactive=True
+                            )

                     # Play selected example voice
                     example_audio = gr.Audio(
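The dropdown added above only becomes useful once its change event feeds update_voice_example, which presumably happens in the "Connect event handlers" section outside the visible hunks. A runnable miniature of that pattern; the voice data here is a stand-in, not the real EXAMPLE_VOICES:

    import gradio as gr

    VOICES = {"Aparna Voice": "sample transcript A", "KC Voice": "sample transcript B"}

    with gr.Blocks() as demo:
        selector = gr.Dropdown(choices=list(VOICES), value="Aparna Voice", label="Select Example Voice")
        info = gr.Textbox(label="Voice Sample Transcript")
        # Selecting a voice pushes its transcript into the textbox,
        # mirroring voice_selector -> update_voice_example in app.py.
        selector.change(fn=lambda name: VOICES[name], inputs=selector, outputs=info)

    if __name__ == "__main__":
        demo.launch()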
@@ -1401,33 +1038,40 @@ def create_chatbot_interface():
                         interactive=False
                     )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    gr.Markdown("### 🎙️ Record Your Own Voice")
+
+                    with gr.Row():
+                        # Or record your own voice
+                        with gr.Column(scale=1):
+                            custom_voice = gr.Audio(
+                                sources=["microphone", "upload"],
+                                type="numpy",
+                                label="Record/Upload Voice"
+                            )
+
+                        # Transcript for custom voice
+                        with gr.Column(scale=2):
+                            custom_transcript = gr.Textbox(
+                                value="",
+                                label="Your Voice Transcript (what you said in Malayalam)",
+                                lines=2
+                            )

                     # Button to save the selected/recorded voice
-
-
+                    with gr.Row():
+                        save_voice_btn = gr.Button("💾 Save Voice for Chat", variant="primary")
+                        voice_status = gr.Textbox(label="Voice Status", value="No voice selected yet")

-                #
+                # Chat controls row (just the clear button)
                 with gr.Row(elem_classes=["chat-controls"]):
+                    # Hidden language selector (kept for functionality)
                     language_selector = gr.Dropdown(
                         choices=["ml-IN", "en-US", "hi-IN", "ta-IN", "te-IN", "kn-IN"],
                         value="ml-IN",
-                        label="Speech Recognition Language"
+                        label="Speech Recognition Language",
+                        visible=False
                     )
-                    clear_btn = gr.Button("🧹 Clear Chat",
+                    clear_btn = gr.Button("🧹 Clear Chat", variant="secondary")

                 # Chat display area
                 chatbot = gr.Chatbot(
@@ -1438,54 +1082,61 @@ def create_chatbot_interface():
                     elem_classes=["chat-window"]
                 )

-                #
-
-
-
-
-
-
-
-
+                # Hidden progress bar (kept for functionality)
+                tts_progress = gr.Slider(
+                    minimum=0,
+                    maximum=100,
+                    value=0,
+                    label="TTS Progress",
+                    interactive=False,
+                    visible=False
+                )

                 # Audio output for the bot's response
                 audio_output = gr.Audio(
                     label="Bot's Voice Response",
                     type="filepath",
                     autoplay=True,
-                    visible=True
+                    visible=True,
+                    elem_classes=["audio-player"]
                 )

-                #
+                # Hidden status message (kept for functionality)
                 status_msg = gr.Textbox(
                     label="Status",
                     value="Ready",
-                    interactive=False
+                    interactive=False,
+                    visible=False
                 )

                 # Input area with separate components
                 with gr.Row(elem_classes=["input-area"]):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    with gr.Column(scale=4):
+                        audio_msg = gr.Textbox(
+                            placeholder="Type a message in Malayalam...",
+                            lines=1,
+                            label=None,
+                            show_label=False
+                        )
+                    with gr.Column(scale=1):
+                        with gr.Row():
+                            audio_input = gr.Audio(
+                                sources=["microphone"],
+                                type="numpy",
+                                label=None,
+                                show_label=False,
+                                elem_classes=["audio-input"]
+                            )
+                        submit_btn = gr.Button("🚀 Send", variant="primary")
+
+        # Function to update voice example info (unchanged)
         def update_voice_example(voice_name):
             for voice in EXAMPLE_VOICES:
                 if voice["name"] == voice_name and "audio" in voice:
                     return voice["transcript"], voice["audio"]
             return "", None

-        # Function to save voice for TTS
+        # Function to save voice for TTS (unchanged)
         def save_voice_for_tts(example_name, example_audio, custom_audio, example_transcript, custom_transcript):
             try:
                 # Check if we're using an example voice or custom recorded voice
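The slider and status textbox above stay in the layout with visible=False so they can keep serving as event-handler outputs. A compact runnable example of that hidden-component pattern:

    import gradio as gr

    def respond(message, history):
        history = history + [[message, f"Echo: {message}"]]
        return history, "", 100  # updated chat, cleared textbox, finished progress

    with gr.Blocks() as demo:
        chat = gr.Chatbot()
        msg = gr.Textbox(show_label=False, placeholder="Type a message...")
        # Hidden but still addressable as an event output, like tts_progress above.
        progress = gr.Slider(minimum=0, maximum=100, value=0, visible=False)
        send = gr.Button("🚀 Send", variant="primary")
        send.click(respond, inputs=[msg, chat], outputs=[chat, msg, progress])

    if __name__ == "__main__":
        demo.launch()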
@@ -1506,7 +1157,7 @@ def create_chatbot_interface():
                     return "Error: No voice selected or recorded"

                 # Save the voice in the engine
-                result =
+                result = conversation_engine.save_reference_voice(voice_audio, transcript)

                 return f"Voice saved successfully! Using {source}"
             except Exception as e:
@@ -1514,16 +1165,16 @@ def create_chatbot_interface():
                 traceback.print_exc()
                 return f"Error saving voice: {str(e)}"

-        # Function to update TTS progress
+        # Function to update TTS progress (unchanged)
         def update_tts_progress(progress):
             return progress

-        # Audio generated callback
+        # Audio generated callback (unchanged)
         def on_tts_generated(audio_path):
             print(f"TTS generation callback received path: {audio_path}")
             return audio_path, 100, "Response ready"  # audio path, 100% progress, status message

-        # Function to process user input and generate response
+        # Function to process user input and generate response (updated to use global engine)
         def process_input(audio, text_input, history, language, progress):
             try:
                 # Update status
@@ -1535,7 +1186,7 @@ def create_chatbot_interface():
                 # Check which input mode we're using
                 if audio is not None:
                     # Audio input
-                    transcribed_text, input_text =
+                    transcribed_text, input_text = speech_recognizer.transcribe_audio(audio, language)
                     if not input_text:
                         status = "Could not understand audio. Please try again."
                         return history, None, status, text_input, progress
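SpeechRecognizer itself is defined elsewhere in app.py; judging by the transcribe_audio method deleted in the first hunk, it wraps the speech_recognition package's Google recognizer. A minimal sketch of such a wrapper under that assumption:

    import os
    import tempfile

    import soundfile as sf
    import speech_recognition as sr

    def transcribe(audio, language="ml-IN"):
        """Transcribe a Gradio-style (sample_rate, samples) tuple with Google STT."""
        sample_rate, samples = audio
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        tmp.close()
        try:
            sf.write(tmp.name, samples, sample_rate)
            recognizer = sr.Recognizer()
            with sr.AudioFile(tmp.name) as source:
                recorded = recognizer.record(source)
            text = recognizer.recognize_google(recorded, language=language)
            return text, text  # (display text, engine input), matching the call above
        except sr.UnknownValueError:
            return "Could not understand audio", ""
        finally:
            os.unlink(tmp.name)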
@@ -1549,7 +1200,7 @@ def create_chatbot_interface():
                     return history, None, status, text_input, progress

                 # Add user message to conversation history
-
+                conversation_engine.add_message("user", input_text)

                 # Update the Gradio chatbot display immediately with user message
                 updated_history = history + [[transcribed_text, None]]
@@ -1559,10 +1210,10 @@ def create_chatbot_interface():
                 progress = 30

                 # Generate response
-                response_text, _ =
+                response_text, _ = conversation_engine.generate_response(input_text)

                 # Add assistant response to conversation history
-
+                conversation_engine.add_message("assistant", response_text)

                 # Update the Gradio chatbot with the assistant's response
                 updated_history = history + [[transcribed_text, response_text]]
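generate_response (on the surviving ConversationEngine) rebuilds the model context from conversation_history on every turn, which is why both add_message calls above matter. The payload it assembles for the legacy openai.ChatCompletion API looks like this:

    # Shape of the context sent to gpt-3.5-turbo each turn (openai 0.x style,
    # as in the code removed in the first hunk).
    system_prompt = "You are a helpful assistant that speaks Malayalam fluently."
    history = [
        {"role": "user", "content": "സുഖമാണോ?"},             # "How are you?"
        {"role": "assistant", "content": "സുഖമാണ്, നന്ദി!"},  # "I'm fine, thanks!"
    ]
    messages = [{"role": "system", "content": system_prompt}]
    messages += [{"role": e["role"], "content": e["content"]} for e in history]
    messages.append({"role": "user", "content": "ഇന്ന് മഴ പെയ്യുമോ?"})  # "Will it rain today?"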
@@ -1572,7 +1223,7 @@ def create_chatbot_interface():
                 progress = 60

                 # Generate speech for response synchronously (for better debugging)
-                audio_path =
+                audio_path = conversation_engine._generate_tts(response_text)

                 if audio_path:
                     status = f"Response ready: {audio_path}"
@@ -1591,9 +1242,9 @@ def create_chatbot_interface():
                 traceback.print_exc()
                 return history, None, error_message, text_input, progress

-        # Function to clear chat history
+        # Function to clear chat history (updated to use global engine)
         def clear_chat():
-
+            conversation_engine.clear_conversation()
             return [], None, "Chat history cleared", "", 0

         # Connect event handlers
@@ -1635,7 +1286,7 @@ def create_chatbot_interface():

         # Setup cleanup on exit
         def exit_handler():
-
+            conversation_engine.cleanup()

         import atexit
         atexit.register(exit_handler)
@@ -1643,10 +1294,4 @@ def create_chatbot_interface():
         # Enable queueing for better responsiveness
         interface.queue()

-    return interface
-
-# Start the interface
-if __name__ == "__main__":
-    print("Starting Malayalam Voice Chatbot with IndicF5 Voice Selection...")
-    interface = create_chatbot_interface()
-    interface.launch(debug=True)  # Enable debug mode to see errors in the console
+    return interface
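With the old __main__ block deleted, this module no longer launches itself; create_chatbot_interface() now just returns the queued Blocks app. The launch call has to come from whatever imports the module, for example (a hypothetical entry point, not part of this commit):

    # run.py - hypothetical launcher; the commit removes the in-module one.
    from app import create_chatbot_interface

    interface = create_chatbot_interface()
    interface.launch(debug=True)  # debug=True surfaces errors in the console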