ceymox committed
Commit b097cb6 · verified · 1 Parent(s): 8905882

Update app.py

Files changed (1):
  1. app.py (+267, -267)
app.py CHANGED

@@ -628,275 +628,275 @@ class StreamingTTS:
             return self.output_file
         return None
 
-class ConversationEngine:
-    def __init__(self):
-        self.conversation_history = []
-        self.system_prompt = "You are a helpful assistant that speaks Malayalam fluently. Always respond in Malayalam script with proper formatting."
-        self.saved_voice = None
-        self.saved_voice_text = ""
-        self.tts_cache = {}  # Cache for TTS outputs
-
-        # TTS background processing queue
-        self.tts_queue = queue.Queue()
-        self.tts_thread = threading.Thread(target=self.tts_worker, daemon=True)
-        self.tts_thread.start()
-
-        # Initialize streaming TTS
-        self.streaming_tts = StreamingTTS()
-
-    def tts_worker(self):
-        """Background worker to process TTS requests"""
-        while True:
-            try:
-                # Get text and callback from queue
-                text, callback = self.tts_queue.get()
-
-                # Generate speech
-                audio_path = self._generate_tts(text)
-
-                # Execute callback with result
-                if callback:
-                    callback(audio_path)
-
-                # Mark task as done
-                self.tts_queue.task_done()
-            except Exception as e:
-                print(f"Error in TTS worker: {e}")
-                traceback.print_exc()
-
-    def transcribe_audio(self, audio_data, language="ml-IN"):
-        """Convert audio to text using speech recognition"""
-        if audio_data is None:
-            print("No audio data received")
-            return "No audio detected", ""
-
-        # Make sure we have audio data in the expected format
-        try:
-            if isinstance(audio_data, tuple) and len(audio_data) == 2:
-                # Expected format: (sample_rate, audio_samples)
-                sample_rate, audio_samples = audio_data
-            else:
-                print(f"Unexpected audio format: {type(audio_data)}")
-                return "Invalid audio format", ""
-
-            if len(audio_samples) == 0:
-                print("Empty audio samples")
-                return "No speech detected", ""
-
-            # Save the audio temporarily
-            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-            temp_file.close()
-
-            # Save the audio data to the temporary file
-            sf.write(temp_file.name, audio_samples, sample_rate)
-
-            # Use speech recognition on the file
-            recognizer = sr.Recognizer()
-            with sr.AudioFile(temp_file.name) as source:
-                audio = recognizer.record(source)
-
-            text = recognizer.recognize_google(audio, language=language)
-            print(f"Recognized: {text}")
-            return text, text
-
-        except sr.UnknownValueError:
-            print("Speech recognition could not understand audio")
-            return "Could not understand audio", ""
-        except sr.RequestError as e:
-            print(f"Could not request results from Google Speech Recognition service: {e}")
-            return f"Speech recognition service error: {str(e)}", ""
-        except Exception as e:
-            print(f"Error processing audio: {e}")
-            traceback.print_exc()
-            return f"Error processing audio: {str(e)}", ""
-        finally:
-            # Clean up temporary file
-            if 'temp_file' in locals() and os.path.exists(temp_file.name):
-                try:
-                    os.unlink(temp_file.name)
-                except Exception as e:
-                    print(f"Error deleting temporary file: {e}")
-
-    def save_reference_voice(self, audio_data, reference_text):
-        """Save the reference voice for future TTS generation"""
-        if audio_data is None or not reference_text.strip():
-            return "Error: Both reference audio and text are required"
-
-        self.saved_voice = audio_data
-        self.saved_voice_text = reference_text.strip()
-
-        # Clear TTS cache when voice changes
-        self.tts_cache.clear()
-
-        # Debug info
-        sample_rate, audio_samples = audio_data
-        print(f"Saved reference voice: {len(audio_samples)} samples at {sample_rate}Hz")
-        print(f"Reference text: {reference_text}")
-
-        return f"Voice saved successfully! Reference text: {reference_text}"
-
-    def process_text_input(self, text):
-        """Process text input from user"""
-        if text and text.strip():
-            return text, text
-        return "No input provided", ""
-
-    def generate_response(self, input_text):
-        """Generate AI response using GPT-3.5 Turbo"""
-        if not input_text or not input_text.strip():
-            return "ഇൻപുട്ട് ലഭിച്ചില്ല. വീണ്ടും ശ്രമിക്കുക.", None  # "No input received. Please try again."
-
-        try:
-            # Prepare conversation context from history
-            messages = [{"role": "system", "content": self.system_prompt}]
-
-            # Add previous conversations for context
-            for entry in self.conversation_history:
-                role = "user" if entry["role"] == "user" else "assistant"
-                messages.append({"role": role, "content": entry["content"]})
-
-            # Add current input
-            messages.append({"role": "user", "content": input_text})
-
-            # Call OpenAI API
-            response = openai.ChatCompletion.create(
-                model="gpt-3.5-turbo",
-                messages=messages,
-                max_tokens=500,
-                temperature=0.7
-            )
-
-            response_text = response.choices[0].message["content"].strip()
-            return response_text, None
-
-        except Exception as e:
-            error_msg = f"എറർ: GPT മോഡലിൽ നിന്ന് ഉത്തരം ലഭിക്കുന്നതിൽ പ്രശ്നമുണ്ടായി: {str(e)}"  # "Error: There was a problem getting an answer from the GPT model."
-            print(f"Error in GPT response: {e}")
-            traceback.print_exc()
-            return error_msg, None
-
-    def resample_audio(self, audio, orig_sr, target_sr):
-        """Resample audio to match target sample rate only if necessary"""
-        if orig_sr != target_sr:
-            print(f"Resampling audio from {orig_sr}Hz to {target_sr}Hz")
-            return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
-        return audio
-
-    def _generate_tts(self, text):
-        """Internal method to generate TTS without threading"""
-        if not text or not text.strip():
-            print("No text provided for TTS generation")
-            return None
-
-        # Check cache first
-        if text in self.tts_cache:
-            print("Using cached TTS output")
-            return self.tts_cache[text]
-
-        try:
-            # Check if we have a saved voice and the TTS model
-            if self.saved_voice is not None and tts_model is not None:
-                sample_rate, audio_data = self.saved_voice
-
-                # Create a temporary file for the reference audio
-                ref_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-                ref_temp_file.close()
-                print(f"Saving reference audio to {ref_temp_file.name}")
-
-                # Save the reference audio data
-                sf.write(ref_temp_file.name, audio_data, sample_rate)
-
-                # Create a temporary file for the output audio
-                output_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-                output_temp_file.close()
-
-                try:
-                    # Generate speech using IndicF5 - simplified approach from second file
-                    print(f"Generating speech with IndicF5. Text: {text[:30]}...")
-                    start_time = time.time()
-
-                    # Use torch.no_grad() to save memory and computation
-                    with torch.no_grad():
-                        # Run the inference using the wrapper
-                        synth_audio = tts_model_wrapper.generate(
-                            text,
-                            ref_audio_path=ref_temp_file.name,
-                            ref_text=self.saved_voice_text
-                        )
-
-                    end_time = time.time()
-                    print(f"Speech generation completed in {end_time - start_time:.2f} seconds")
-
-                    # Process audio for better quality
-                    synth_audio = enhance_audio(synth_audio)
-
-                    # Save the synthesized audio
-                    sf.write(output_temp_file.name, synth_audio, 24000)  # IndicF5 uses 24kHz
-
-                    # Add to cache
-                    self.tts_cache[text] = output_temp_file.name
-
-                    print(f"TTS output saved to {output_temp_file.name}")
-                    return output_temp_file.name
-
-                except Exception as e:
-                    print(f"Error generating speech: {e}")
-                    traceback.print_exc()
-                    return None
-                finally:
-                    # We don't delete the output file as it's returned to the caller
-                    # But clean up reference file
-                    try:
-                        os.unlink(ref_temp_file.name)
-                    except Exception as e:
-                        print(f"Error cleaning up reference file: {e}")
-            else:
-                print("No saved voice reference or TTS model not loaded")
-                return None
-        except Exception as e:
-            print(f"Error in TTS processing: {e}")
-            traceback.print_exc()
-            return None
-
-    def queue_tts_generation(self, text, callback=None):
-        """Queue TTS generation in background thread"""
-        print(f"Queueing TTS generation for text: {text[:30]}...")
-        self.tts_queue.put((text, callback))
-
-    def generate_streamed_speech(self, text):
-        """Generate speech in a streaming manner for low latency"""
-        if not self.saved_voice:
-            print("No reference voice saved")
-            return None
-
-        if not text or not text.strip():
-            print("No text provided for streaming TTS")
-            return None
-
-        sample_rate, audio_data = self.saved_voice
-
-        # Start streaming generation
-        self.streaming_tts.generate(
-            text=text,
-            ref_audio=audio_data,
-            ref_sr=sample_rate,
-            ref_text=self.saved_voice_text
-        )
-
-        # Return the path that will be populated
-        return self.streaming_tts.output_file
-
-    def update_history(self, user_input, ai_response):
-        """Update conversation history"""
-        if user_input and user_input.strip():
-            self.conversation_history.append({"role": "user", "content": user_input})
-
-        if ai_response and ai_response.strip():
-            self.conversation_history.append({"role": "assistant", "content": ai_response})
-
-        # Limit history size
-        if len(self.conversation_history) > 20:
-            self.conversation_history = self.conversation_history[-20:]
+# class ConversationEngine:
+#     def __init__(self):
+#         self.conversation_history = []
+#         self.system_prompt = "You are a helpful assistant that speaks Malayalam fluently. Always respond in Malayalam script with proper formatting."
+#         self.saved_voice = None
+#         self.saved_voice_text = ""
+#         self.tts_cache = {}  # Cache for TTS outputs
+
+#         # TTS background processing queue
+#         self.tts_queue = queue.Queue()
+#         self.tts_thread = threading.Thread(target=self.tts_worker, daemon=True)
+#         self.tts_thread.start()
+
+#         # Initialize streaming TTS
+#         self.streaming_tts = StreamingTTS()
+
+#     def tts_worker(self):
+#         """Background worker to process TTS requests"""
+#         while True:
+#             try:
+#                 # Get text and callback from queue
+#                 text, callback = self.tts_queue.get()
+
+#                 # Generate speech
+#                 audio_path = self._generate_tts(text)
+
+#                 # Execute callback with result
+#                 if callback:
+#                     callback(audio_path)
+
+#                 # Mark task as done
+#                 self.tts_queue.task_done()
+#             except Exception as e:
+#                 print(f"Error in TTS worker: {e}")
+#                 traceback.print_exc()
+
+#     def transcribe_audio(self, audio_data, language="ml-IN"):
+#         """Convert audio to text using speech recognition"""
+#         if audio_data is None:
+#             print("No audio data received")
+#             return "No audio detected", ""
+
+#         # Make sure we have audio data in the expected format
+#         try:
+#             if isinstance(audio_data, tuple) and len(audio_data) == 2:
+#                 # Expected format: (sample_rate, audio_samples)
+#                 sample_rate, audio_samples = audio_data
+#             else:
+#                 print(f"Unexpected audio format: {type(audio_data)}")
+#                 return "Invalid audio format", ""
+
+#             if len(audio_samples) == 0:
+#                 print("Empty audio samples")
+#                 return "No speech detected", ""
+
+#             # Save the audio temporarily
+#             temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+#             temp_file.close()
+
+#             # Save the audio data to the temporary file
+#             sf.write(temp_file.name, audio_samples, sample_rate)
+
+#             # Use speech recognition on the file
+#             recognizer = sr.Recognizer()
+#             with sr.AudioFile(temp_file.name) as source:
+#                 audio = recognizer.record(source)
+
+#             text = recognizer.recognize_google(audio, language=language)
+#             print(f"Recognized: {text}")
+#             return text, text
+
+#         except sr.UnknownValueError:
+#             print("Speech recognition could not understand audio")
+#             return "Could not understand audio", ""
+#         except sr.RequestError as e:
+#             print(f"Could not request results from Google Speech Recognition service: {e}")
+#             return f"Speech recognition service error: {str(e)}", ""
+#         except Exception as e:
+#             print(f"Error processing audio: {e}")
+#             traceback.print_exc()
+#             return f"Error processing audio: {str(e)}", ""
+#         finally:
+#             # Clean up temporary file
+#             if 'temp_file' in locals() and os.path.exists(temp_file.name):
+#                 try:
+#                     os.unlink(temp_file.name)
+#                 except Exception as e:
+#                     print(f"Error deleting temporary file: {e}")
+
+#     def save_reference_voice(self, audio_data, reference_text):
+#         """Save the reference voice for future TTS generation"""
+#         if audio_data is None or not reference_text.strip():
+#             return "Error: Both reference audio and text are required"
+
+#         self.saved_voice = audio_data
+#         self.saved_voice_text = reference_text.strip()
+
+#         # Clear TTS cache when voice changes
+#         self.tts_cache.clear()
+
+#         # Debug info
+#         sample_rate, audio_samples = audio_data
+#         print(f"Saved reference voice: {len(audio_samples)} samples at {sample_rate}Hz")
+#         print(f"Reference text: {reference_text}")
+
+#         return f"Voice saved successfully! Reference text: {reference_text}"
+
+#     def process_text_input(self, text):
+#         """Process text input from user"""
+#         if text and text.strip():
+#             return text, text
+#         return "No input provided", ""
+
+#     def generate_response(self, input_text):
+#         """Generate AI response using GPT-3.5 Turbo"""
+#         if not input_text or not input_text.strip():
+#             return "ഇൻപുട്ട് ലഭിച്ചില്ല. വീണ്ടും ശ്രമിക്കുക.", None  # "No input received. Please try again."
+
+#         try:
+#             # Prepare conversation context from history
+#             messages = [{"role": "system", "content": self.system_prompt}]
+
+#             # Add previous conversations for context
+#             for entry in self.conversation_history:
+#                 role = "user" if entry["role"] == "user" else "assistant"
+#                 messages.append({"role": role, "content": entry["content"]})
+
+#             # Add current input
+#             messages.append({"role": "user", "content": input_text})
+
+#             # Call OpenAI API
+#             response = openai.ChatCompletion.create(
+#                 model="gpt-3.5-turbo",
+#                 messages=messages,
+#                 max_tokens=500,
+#                 temperature=0.7
+#             )
+
+#             response_text = response.choices[0].message["content"].strip()
+#             return response_text, None
+
+#         except Exception as e:
+#             error_msg = f"എറർ: GPT മോഡലിൽ നിന്ന് ഉത്തരം ലഭിക്കുന്നതിൽ പ്രശ്നമുണ്ടായി: {str(e)}"  # "Error: There was a problem getting an answer from the GPT model."
+#             print(f"Error in GPT response: {e}")
+#             traceback.print_exc()
+#             return error_msg, None
+
+#     def resample_audio(self, audio, orig_sr, target_sr):
+#         """Resample audio to match target sample rate only if necessary"""
+#         if orig_sr != target_sr:
+#             print(f"Resampling audio from {orig_sr}Hz to {target_sr}Hz")
+#             return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
+#         return audio
+
+#     def _generate_tts(self, text):
+#         """Internal method to generate TTS without threading"""
+#         if not text or not text.strip():
+#             print("No text provided for TTS generation")
+#             return None
+
+#         # Check cache first
+#         if text in self.tts_cache:
+#             print("Using cached TTS output")
+#             return self.tts_cache[text]
+
+#         try:
+#             # Check if we have a saved voice and the TTS model
+#             if self.saved_voice is not None and tts_model is not None:
+#                 sample_rate, audio_data = self.saved_voice
+
+#                 # Create a temporary file for the reference audio
+#                 ref_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+#                 ref_temp_file.close()
+#                 print(f"Saving reference audio to {ref_temp_file.name}")
+
+#                 # Save the reference audio data
+#                 sf.write(ref_temp_file.name, audio_data, sample_rate)
+
+#                 # Create a temporary file for the output audio
+#                 output_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+#                 output_temp_file.close()
+
+#                 try:
+#                     # Generate speech using IndicF5 - simplified approach from second file
+#                     print(f"Generating speech with IndicF5. Text: {text[:30]}...")
+#                     start_time = time.time()
+
+#                     # Use torch.no_grad() to save memory and computation
+#                     with torch.no_grad():
+#                         # Run the inference using the wrapper
+#                         synth_audio = tts_model_wrapper.generate(
+#                             text,
+#                             ref_audio_path=ref_temp_file.name,
+#                             ref_text=self.saved_voice_text
+#                         )
+
+#                     end_time = time.time()
+#                     print(f"Speech generation completed in {end_time - start_time:.2f} seconds")
+
+#                     # Process audio for better quality
+#                     synth_audio = enhance_audio(synth_audio)
+
+#                     # Save the synthesized audio
+#                     sf.write(output_temp_file.name, synth_audio, 24000)  # IndicF5 uses 24kHz
+
+#                     # Add to cache
+#                     self.tts_cache[text] = output_temp_file.name
+
+#                     print(f"TTS output saved to {output_temp_file.name}")
+#                     return output_temp_file.name
+
+#                 except Exception as e:
+#                     print(f"Error generating speech: {e}")
+#                     traceback.print_exc()
+#                     return None
+#                 finally:
+#                     # We don't delete the output file as it's returned to the caller
+#                     # But clean up reference file
+#                     try:
+#                         os.unlink(ref_temp_file.name)
+#                     except Exception as e:
+#                         print(f"Error cleaning up reference file: {e}")
+#             else:
+#                 print("No saved voice reference or TTS model not loaded")
+#                 return None
+#         except Exception as e:
+#             print(f"Error in TTS processing: {e}")
+#             traceback.print_exc()
+#             return None
+
+#     def queue_tts_generation(self, text, callback=None):
+#         """Queue TTS generation in background thread"""
+#         print(f"Queueing TTS generation for text: {text[:30]}...")
+#         self.tts_queue.put((text, callback))
+
+#     def generate_streamed_speech(self, text):
+#         """Generate speech in a streaming manner for low latency"""
+#         if not self.saved_voice:
+#             print("No reference voice saved")
+#             return None
+
+#         if not text or not text.strip():
+#             print("No text provided for streaming TTS")
+#             return None
+
+#         sample_rate, audio_data = self.saved_voice
+
+#         # Start streaming generation
+#         self.streaming_tts.generate(
+#             text=text,
+#             ref_audio=audio_data,
+#             ref_sr=sample_rate,
+#             ref_text=self.saved_voice_text
+#         )
+
+#         # Return the path that will be populated
+#         return self.streaming_tts.output_file
+
+#     def update_history(self, user_input, ai_response):
+#         """Update conversation history"""
+#         if user_input and user_input.strip():
+#             self.conversation_history.append({"role": "user", "content": user_input})
+
+#         if ai_response and ai_response.strip():
+#             self.conversation_history.append({"role": "assistant", "content": ai_response})
+
+#         # Limit history size
+#         if len(self.conversation_history) > 20:
+#             self.conversation_history = self.conversation_history[-20:]
 
 # Initialize global conversation engine
 conversation_engine = ConversationEngine()
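
Review note: two things stand out in this hunk. First, the unchanged context line `conversation_engine = ConversationEngine()` still instantiates the class that this revision comments out, so unless another ConversationEngine definition exists elsewhere in app.py, the file will raise a NameError at import. Second, the commented-out generate_response targets the legacy `openai.ChatCompletion.create` interface, which was removed in openai-python 1.0. If the class is ever restored, a minimal sketch of the equivalent call against the v1 client (assuming OPENAI_API_KEY is set in the environment) would look like:

    from openai import OpenAI  # openai-python >= 1.0

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    def generate_response_v1(messages):
        # Same request as the deleted code, expressed against the v1 client.
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
            max_tokens=500,
            temperature=0.7,
        )
        # v1 responses are typed objects: the content is an attribute,
        # not a dict key as in response.choices[0].message["content"].
        return response.choices[0].message.content.strip()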
@@ -1421,12 +1421,12 @@ def create_chatbot_interface():
         voice_status = gr.Textbox(label="Voice Status", value="No voice saved yet")
 
         # Language selector and controls for chat
-        with gr.Row(elem_classes=["chat-controls"]):
-            language_selector = gr.Dropdown(
-                choices=["ml-IN", "en-US", "hi-IN", "ta-IN", "te-IN", "kn-IN"],
-                value="ml-IN",
-                label="Speech Recognition Language"
-            )
+        # with gr.Row(elem_classes=["chat-controls"]):
+        #     language_selector = gr.Dropdown(
+        #         choices=["ml-IN", "en-US", "hi-IN", "ta-IN", "te-IN", "kn-IN"],
+        #         value="ml-IN",
+        #         label="Speech Recognition Language"
+        #     )
         clear_btn = gr.Button("🧹 Clear Chat", scale=0)
 
         # Chat display area
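
Review note: with the dropdown commented out, any event handler that was wired to `language_selector` loses that input, and `transcribe_audio` falls back to its hard-coded `ml-IN` default. A hedged sketch of one way to keep the language configurable without a visible control, using `gr.State` (the component names `mic_input` and `transcript` are hypothetical here, since the actual event wiring sits outside this hunk):

    import gradio as gr

    with gr.Blocks() as demo:
        # Pin the recognition language in session state instead of a dropdown.
        language_state = gr.State("ml-IN")
        mic_input = gr.Audio(sources=["microphone"], type="numpy")
        transcript = gr.Textbox(label="Transcript")

        # transcribe_audio returns (display_text, raw_text); show the first.
        mic_input.stop_recording(
            fn=lambda audio, lang: conversation_engine.transcribe_audio(audio, lang)[0],
            inputs=[mic_input, language_state],
            outputs=transcript,
        )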
@@ -1438,15 +1438,15 @@ def create_chatbot_interface():
             elem_classes=["chat-window"]
         )
 
-        # Progress bar for TTS generation
-        with gr.Row():
-            tts_progress = gr.Slider(
-                minimum=0,
-                maximum=100,
-                value=0,
-                label="TTS Progress",
-                interactive=False
-            )
+        # # Progress bar for TTS generation
+        # with gr.Row():
+        #     tts_progress = gr.Slider(
+        #         minimum=0,
+        #         maximum=100,
+        #         value=0,
+        #         label="TTS Progress",
+        #         interactive=False
+        #     )
 
         # Audio output for the bot's response
         audio_output = gr.Audio(
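
Review note: the removed slider was doubling as a progress bar. Gradio ships a built-in alternative, `gr.Progress`, which renders its own bar while a handler runs; whether that motivated this change is not stated in the commit. A minimal sketch (the handler name is hypothetical):

    import gradio as gr

    def synthesize_with_progress(text, progress=gr.Progress()):
        # gr.Progress draws its own bar, so no dedicated slider is needed.
        progress(0.1, desc="Generating speech")
        audio_path = conversation_engine._generate_tts(text)
        progress(1.0, desc="Done")
        return audio_path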
@@ -1456,12 +1456,12 @@ def create_chatbot_interface():
             visible=True
         )
 
-        # Status message for debugging
-        status_msg = gr.Textbox(
-            label="Status",
-            value="Ready",
-            interactive=False
-        )
+        # # Status message for debugging
+        # status_msg = gr.Textbox(
+        #     label="Status",
+        #     value="Ready",
+        #     interactive=False
+        # )
 
         # Input area with separate components
         with gr.Row(elem_classes=["input-area"]):
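
Review note: handlers that previously returned a value into `status_msg` need their outputs lists trimmed to match. If lightweight status feedback is still wanted without reserving layout space, `gr.Info` raises a transient toast from inside a handler; a sketch (hypothetical handler name):

    import gradio as gr

    def report_status():
        # Transient toast instead of the removed status textbox.
        gr.Info("Ready")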
 