ceymox committed on
Commit
3ddece1
·
verified ·
1 Parent(s): b097cb6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +246 -246
app.py CHANGED
@@ -628,275 +628,275 @@ class StreamingTTS:
628
  return self.output_file
629
  return None
630
 
631
- # class ConversationEngine:
632
- # def __init__(self):
633
- # self.conversation_history = []
634
- # self.system_prompt = "You are a helpful assistant that speaks Malayalam fluently. Always respond in Malayalam script with proper formatting."
635
- # self.saved_voice = None
636
- # self.saved_voice_text = ""
637
- # self.tts_cache = {} # Cache for TTS outputs
638
 
639
- # # TTS background processing queue
640
- # self.tts_queue = queue.Queue()
641
- # self.tts_thread = threading.Thread(target=self.tts_worker, daemon=True)
642
- # self.tts_thread.start()
643
-
644
- # # Initialize streaming TTS
645
- # self.streaming_tts = StreamingTTS()
646
-
647
- # def tts_worker(self):
648
- # """Background worker to process TTS requests"""
649
- # while True:
650
- # try:
651
- # # Get text and callback from queue
652
- # text, callback = self.tts_queue.get()
653
 
654
- # # Generate speech
655
- # audio_path = self._generate_tts(text)
656
 
657
- # # Execute callback with result
658
- # if callback:
659
- # callback(audio_path)
660
 
661
- # # Mark task as done
662
- # self.tts_queue.task_done()
663
- # except Exception as e:
664
- # print(f"Error in TTS worker: {e}")
665
- # traceback.print_exc()
666
-
667
- # def transcribe_audio(self, audio_data, language="ml-IN"):
668
- # """Convert audio to text using speech recognition"""
669
- # if audio_data is None:
670
- # print("No audio data received")
671
- # return "No audio detected", ""
672
-
673
- # # Make sure we have audio data in the expected format
674
- # try:
675
- # if isinstance(audio_data, tuple) and len(audio_data) == 2:
676
- # # Expected format: (sample_rate, audio_samples)
677
- # sample_rate, audio_samples = audio_data
678
- # else:
679
- # print(f"Unexpected audio format: {type(audio_data)}")
680
- # return "Invalid audio format", ""
681
-
682
- # if len(audio_samples) == 0:
683
- # print("Empty audio samples")
684
- # return "No speech detected", ""
685
-
686
- # # Save the audio temporarily
687
- # temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
688
- # temp_file.close()
689
-
690
- # # Save the audio data to the temporary file
691
- # sf.write(temp_file.name, audio_samples, sample_rate)
692
-
693
- # # Use speech recognition on the file
694
- # recognizer = sr.Recognizer()
695
- # with sr.AudioFile(temp_file.name) as source:
696
- # audio = recognizer.record(source)
697
-
698
- # text = recognizer.recognize_google(audio, language=language)
699
- # print(f"Recognized: {text}")
700
- # return text, text
701
-
702
- # except sr.UnknownValueError:
703
- # print("Speech recognition could not understand audio")
704
- # return "Could not understand audio", ""
705
- # except sr.RequestError as e:
706
- # print(f"Could not request results from Google Speech Recognition service: {e}")
707
- # return f"Speech recognition service error: {str(e)}", ""
708
- # except Exception as e:
709
- # print(f"Error processing audio: {e}")
710
- # traceback.print_exc()
711
- # return f"Error processing audio: {str(e)}", ""
712
- # finally:
713
- # # Clean up temporary file
714
- # if 'temp_file' in locals() and os.path.exists(temp_file.name):
715
- # try:
716
- # os.unlink(temp_file.name)
717
- # except Exception as e:
718
- # print(f"Error deleting temporary file: {e}")
719
-
720
- # def save_reference_voice(self, audio_data, reference_text):
721
- # """Save the reference voice for future TTS generation"""
722
- # if audio_data is None or not reference_text.strip():
723
- # return "Error: Both reference audio and text are required"
724
-
725
- # self.saved_voice = audio_data
726
- # self.saved_voice_text = reference_text.strip()
727
 
728
- # # Clear TTS cache when voice changes
729
- # self.tts_cache.clear()
730
 
731
- # # Debug info
732
- # sample_rate, audio_samples = audio_data
733
- # print(f"Saved reference voice: {len(audio_samples)} samples at {sample_rate}Hz")
734
- # print(f"Reference text: {reference_text}")
735
 
736
- # return f"Voice saved successfully! Reference text: {reference_text}"
737
-
738
- # def process_text_input(self, text):
739
- # """Process text input from user"""
740
- # if text and text.strip():
741
- # return text, text
742
- # return "No input provided", ""
743
-
744
- # def generate_response(self, input_text):
745
- # """Generate AI response using GPT-3.5 Turbo"""
746
- # if not input_text or not input_text.strip():
747
- # return "ഇൻപുട്ട് ലഭിച്ചില്ല. വീണ്ടും ശ്രമിക്കുക.", None # "No input received. Please try again."
748
-
749
- # try:
750
- # # Prepare conversation context from history
751
- # messages = [{"role": "system", "content": self.system_prompt}]
752
-
753
- # # Add previous conversations for context
754
- # for entry in self.conversation_history:
755
- # role = "user" if entry["role"] == "user" else "assistant"
756
- # messages.append({"role": role, "content": entry["content"]})
757
-
758
- # # Add current input
759
- # messages.append({"role": "user", "content": input_text})
760
-
761
- # # Call OpenAI API
762
- # response = openai.ChatCompletion.create(
763
- # model="gpt-3.5-turbo",
764
- # messages=messages,
765
- # max_tokens=500,
766
- # temperature=0.7
767
- # )
768
-
769
- # response_text = response.choices[0].message["content"].strip()
770
- # return response_text, None
771
-
772
- # except Exception as e:
773
- # error_msg = f"എറർ: GPT മോഡലിൽ നിന്ന് ഉത്തരം ലഭിക്കുന്നതിൽ പ്രശ്നമുണ്ടായി: {str(e)}"
774
- # print(f"Error in GPT response: {e}")
775
- # traceback.print_exc()
776
- # return error_msg, None
777
-
778
- # def resample_audio(self, audio, orig_sr, target_sr):
779
- # """Resample audio to match target sample rate only if necessary"""
780
- # if orig_sr != target_sr:
781
- # print(f"Resampling audio from {orig_sr}Hz to {target_sr}Hz")
782
- # return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
783
- # return audio
784
-
785
- # def _generate_tts(self, text):
786
- # """Internal method to generate TTS without threading"""
787
- # if not text or not text.strip():
788
- # print("No text provided for TTS generation")
789
- # return None
790
 
791
- # # Check cache first
792
- # if text in self.tts_cache:
793
- # print("Using cached TTS output")
794
- # return self.tts_cache[text]
795
-
796
- # try:
797
- # # Check if we have a saved voice and the TTS model
798
- # if self.saved_voice is not None and tts_model is not None:
799
- # sample_rate, audio_data = self.saved_voice
800
 
801
- # # Create a temporary file for the reference audio
802
- # ref_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
803
- # ref_temp_file.close()
804
- # print(f"Saving reference audio to {ref_temp_file.name}")
805
 
806
- # # Save the reference audio data
807
- # sf.write(ref_temp_file.name, audio_data, sample_rate)
808
-
809
- # # Create a temporary file for the output audio
810
- # output_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
811
- # output_temp_file.close()
812
-
813
- # try:
814
- # # Generate speech using IndicF5 - simplified approach from second file
815
- # print(f"Generating speech with IndicF5. Text: {text[:30]}...")
816
- # start_time = time.time()
817
-
818
- # # Use torch.no_grad() to save memory and computation
819
- # with torch.no_grad():
820
- # # Run the inference using the wrapper
821
- # synth_audio = tts_model_wrapper.generate(
822
- # text,
823
- # ref_audio_path=ref_temp_file.name,
824
- # ref_text=self.saved_voice_text
825
- # )
826
 
827
- # end_time = time.time()
828
- # print(f"Speech generation completed in {end_time - start_time:.2f} seconds")
829
 
830
- # # Process audio for better quality
831
- # synth_audio = enhance_audio(synth_audio)
832
 
833
- # # Save the synthesized audio
834
- # sf.write(output_temp_file.name, synth_audio, 24000) # IndicF5 uses 24kHz
835
 
836
- # # Add to cache
837
- # self.tts_cache[text] = output_temp_file.name
838
 
839
- # print(f"TTS output saved to {output_temp_file.name}")
840
- # return output_temp_file.name
841
 
842
- # except Exception as e:
843
- # print(f"Error generating speech: {e}")
844
- # traceback.print_exc()
845
- # return None
846
- # finally:
847
- # # We don't delete the output file as it's returned to the caller
848
- # # But clean up reference file
849
- # try:
850
- # os.unlink(ref_temp_file.name)
851
- # except Exception as e:
852
- # print(f"Error cleaning up reference file: {e}")
853
- # else:
854
- # print("No saved voice reference or TTS model not loaded")
855
- # return None
856
- # except Exception as e:
857
- # print(f"Error in TTS processing: {e}")
858
- # traceback.print_exc()
859
- # return None
860
-
861
- # def queue_tts_generation(self, text, callback=None):
862
- # """Queue TTS generation in background thread"""
863
- # print(f"Queueing TTS generation for text: {text[:30]}...")
864
- # self.tts_queue.put((text, callback))
865
-
866
- # def generate_streamed_speech(self, text):
867
- # """Generate speech in a streaming manner for low latency"""
868
- # if not self.saved_voice:
869
- # print("No reference voice saved")
870
- # return None
871
 
872
- # if not text or not text.strip():
873
- # print("No text provided for streaming TTS")
874
- # return None
875
 
876
- # sample_rate, audio_data = self.saved_voice
877
 
878
- # # Start streaming generation
879
- # self.streaming_tts.generate(
880
- # text=text,
881
- # ref_audio=audio_data,
882
- # ref_sr=sample_rate,
883
- # ref_text=self.saved_voice_text
884
- # )
885
 
886
- # # Return the path that will be populated
887
- # return self.streaming_tts.output_file
888
 
889
- # def update_history(self, user_input, ai_response):
890
- # """Update conversation history"""
891
- # if user_input and user_input.strip():
892
- # self.conversation_history.append({"role": "user", "content": user_input})
893
 
894
- # if ai_response and ai_response.strip():
895
- # self.conversation_history.append({"role": "assistant", "content": ai_response})
896
 
897
- # # Limit history size
898
- # if len(self.conversation_history) > 20:
899
- # self.conversation_history = self.conversation_history[-20:]
900
 
901
  # Initialize global conversation engine
902
  conversation_engine = ConversationEngine()
 
628
  return self.output_file
629
  return None
630
 
631
class ConversationEngine:
    """Core orchestrator for the Malayalam voice assistant.

    Glues together three stages:
      1. speech-to-text via Google Speech Recognition (``transcribe_audio``),
      2. reply generation with GPT-3.5 Turbo (``generate_response``),
      3. text-to-speech driven by a user-supplied reference voice, either
         batched on a background worker thread (``queue_tts_generation`` /
         ``_generate_tts``) or streamed for low latency
         (``generate_streamed_speech`` through ``StreamingTTS``).

    TTS output paths are cached per input text in ``tts_cache``; the cache is
    cleared whenever a new reference voice is saved.
    """

    def __init__(self):
        self.conversation_history = []  # alternating {"role", "content"} dicts
        self.system_prompt = "You are a helpful assistant that speaks Malayalam fluently. Always respond in Malayalam script with proper formatting."
        self.saved_voice = None         # (sample_rate, samples) tuple or None
        self.saved_voice_text = ""      # transcript matching saved_voice
        self.tts_cache = {}             # text -> path of synthesized wav file

        # Background queue + daemon thread so TTS never blocks the caller.
        self.tts_queue = queue.Queue()
        self.tts_thread = threading.Thread(target=self.tts_worker, daemon=True)
        self.tts_thread.start()

        # Low-latency streaming TTS backend.
        self.streaming_tts = StreamingTTS()

    def tts_worker(self):
        """Background worker that drains ``tts_queue`` forever.

        Each queue item is a ``(text, callback)`` tuple; the callback (if
        any) receives the path of the generated wav file, or None on failure.
        """
        while True:
            try:
                # Blocks until a request is available.
                text, callback = self.tts_queue.get()
                try:
                    audio_path = self._generate_tts(text)
                    if callback:
                        callback(audio_path)
                finally:
                    # BUGFIX: task_done() now runs even when the callback
                    # raises; previously it was skipped on error, so any
                    # tts_queue.join() would hang forever.
                    self.tts_queue.task_done()
            except Exception as e:
                print(f"Error in TTS worker: {e}")
                traceback.print_exc()

    def transcribe_audio(self, audio_data, language="ml-IN"):
        """Convert recorded audio to text using Google speech recognition.

        Args:
            audio_data: ``(sample_rate, samples)`` tuple (Gradio audio
                widget format) or None.
            language: BCP-47 code passed to the recognizer; defaults to
                Malayalam (India).

        Returns:
            ``(display_text, recognized_text)``. On any failure the second
            element is ``""`` so downstream stages can skip processing.
        """
        if audio_data is None:
            print("No audio data received")
            return "No audio detected", ""

        try:
            if isinstance(audio_data, tuple) and len(audio_data) == 2:
                # Expected format: (sample_rate, audio_samples)
                sample_rate, audio_samples = audio_data
            else:
                print(f"Unexpected audio format: {type(audio_data)}")
                return "Invalid audio format", ""

            if len(audio_samples) == 0:
                print("Empty audio samples")
                return "No speech detected", ""

            # The recognizer reads from disk, so round-trip through a
            # temporary wav file.
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
            temp_file.close()
            sf.write(temp_file.name, audio_samples, sample_rate)

            recognizer = sr.Recognizer()
            with sr.AudioFile(temp_file.name) as source:
                audio = recognizer.record(source)

            text = recognizer.recognize_google(audio, language=language)
            print(f"Recognized: {text}")
            return text, text

        except sr.UnknownValueError:
            print("Speech recognition could not understand audio")
            return "Could not understand audio", ""
        except sr.RequestError as e:
            print(f"Could not request results from Google Speech Recognition service: {e}")
            return f"Speech recognition service error: {str(e)}", ""
        except Exception as e:
            print(f"Error processing audio: {e}")
            traceback.print_exc()
            return f"Error processing audio: {str(e)}", ""
        finally:
            # temp_file only exists if the format checks passed.
            if 'temp_file' in locals() and os.path.exists(temp_file.name):
                try:
                    os.unlink(temp_file.name)
                except Exception as e:
                    print(f"Error deleting temporary file: {e}")

    def save_reference_voice(self, audio_data, reference_text):
        """Save the reference voice used for all subsequent TTS generation.

        Args:
            audio_data: ``(sample_rate, samples)`` tuple of the reference
                recording.
            reference_text: transcript of the reference recording.

        Returns:
            A human-readable status string.
        """
        if audio_data is None or not reference_text.strip():
            return "Error: Both reference audio and text are required"

        self.saved_voice = audio_data
        self.saved_voice_text = reference_text.strip()

        # Cached wavs were synthesized with the old voice — invalidate them.
        self.tts_cache.clear()

        # Debug info
        sample_rate, audio_samples = audio_data
        print(f"Saved reference voice: {len(audio_samples)} samples at {sample_rate}Hz")
        print(f"Reference text: {reference_text}")

        return f"Voice saved successfully! Reference text: {reference_text}"

    def process_text_input(self, text):
        """Validate typed user input.

        Returns ``(display_text, text_for_model)``; the second element is
        ``""`` when the input is empty or whitespace-only.
        """
        if text and text.strip():
            return text, text
        return "No input provided", ""

    def generate_response(self, input_text):
        """Generate an AI reply using GPT-3.5 Turbo.

        The full ``conversation_history`` is replayed as context after the
        system prompt.

        Returns:
            ``(response_text, None)`` — the second slot is reserved for an
            audio path filled in by later pipeline stages.
        """
        if not input_text or not input_text.strip():
            return "ഇൻപുട്ട് ലഭിച്ചില്ല. വീണ്ടും ശ്രമിക്കുക.", None  # "No input received. Please try again."

        try:
            # Prepare conversation context from history.
            messages = [{"role": "system", "content": self.system_prompt}]
            for entry in self.conversation_history:
                role = "user" if entry["role"] == "user" else "assistant"
                messages.append({"role": role, "content": entry["content"]})

            # Add current input.
            messages.append({"role": "user", "content": input_text})

            # Legacy (pre-1.0) OpenAI SDK call style.
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=messages,
                max_tokens=500,
                temperature=0.7
            )

            response_text = response.choices[0].message["content"].strip()
            return response_text, None

        except Exception as e:
            error_msg = f"എറർ: GPT മോഡലിൽ നിന്ന് ഉത്തരം ലഭിക്കുന്നതിൽ പ്രശ്നമുണ്ടായി: {str(e)}"
            print(f"Error in GPT response: {e}")
            traceback.print_exc()
            return error_msg, None

    def resample_audio(self, audio, orig_sr, target_sr):
        """Resample audio to target_sr, returning it untouched if the rates
        already match (avoids a needless librosa pass)."""
        if orig_sr != target_sr:
            print(f"Resampling audio from {orig_sr}Hz to {target_sr}Hz")
            return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
        return audio

    def _generate_tts(self, text):
        """Synchronously synthesize ``text`` with the saved reference voice.

        Returns the path of the generated wav file, a cached path when the
        same text was synthesized before, or None on any failure or when no
        reference voice / model is available.
        """
        if not text or not text.strip():
            print("No text provided for TTS generation")
            return None

        # Check cache first — synthesis is expensive.
        if text in self.tts_cache:
            print("Using cached TTS output")
            return self.tts_cache[text]

        try:
            # Need both a saved voice and a loaded TTS model.
            if self.saved_voice is not None and tts_model is not None:
                sample_rate, audio_data = self.saved_voice

                # IndicF5 expects the reference audio as a file on disk.
                ref_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
                ref_temp_file.close()
                print(f"Saving reference audio to {ref_temp_file.name}")
                sf.write(ref_temp_file.name, audio_data, sample_rate)

                # Output wav; intentionally not deleted — its path is the
                # return value and is also kept in the cache.
                output_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
                output_temp_file.close()

                try:
                    print(f"Generating speech with IndicF5. Text: {text[:30]}...")
                    start_time = time.time()

                    # no_grad: inference only, saves memory and compute.
                    with torch.no_grad():
                        synth_audio = tts_model_wrapper.generate(
                            text,
                            ref_audio_path=ref_temp_file.name,
                            ref_text=self.saved_voice_text
                        )

                    end_time = time.time()
                    print(f"Speech generation completed in {end_time - start_time:.2f} seconds")

                    # Post-process for better quality, then persist.
                    synth_audio = enhance_audio(synth_audio)
                    sf.write(output_temp_file.name, synth_audio, 24000)  # IndicF5 uses 24kHz

                    self.tts_cache[text] = output_temp_file.name
                    print(f"TTS output saved to {output_temp_file.name}")
                    return output_temp_file.name

                except Exception as e:
                    print(f"Error generating speech: {e}")
                    traceback.print_exc()
                    return None
                finally:
                    # Keep the output file (returned to caller); only the
                    # reference wav is cleaned up here.
                    try:
                        os.unlink(ref_temp_file.name)
                    except Exception as e:
                        print(f"Error cleaning up reference file: {e}")
            else:
                print("No saved voice reference or TTS model not loaded")
                return None
        except Exception as e:
            print(f"Error in TTS processing: {e}")
            traceback.print_exc()
            return None

    def queue_tts_generation(self, text, callback=None):
        """Queue TTS generation on the background worker thread."""
        print(f"Queueing TTS generation for text: {text[:30]}...")
        self.tts_queue.put((text, callback))

    def generate_streamed_speech(self, text):
        """Start low-latency streaming synthesis of ``text``.

        Returns the output path that ``StreamingTTS`` will progressively
        populate, or None when no reference voice or text is available.
        """
        # Consistency fix: use the same `is None` check as the rest of the
        # class (saved_voice is either None or a non-empty tuple, so this
        # is behaviorally identical to the old truthiness test).
        if self.saved_voice is None:
            print("No reference voice saved")
            return None

        if not text or not text.strip():
            print("No text provided for streaming TTS")
            return None

        sample_rate, audio_data = self.saved_voice

        # Kick off streaming generation.
        self.streaming_tts.generate(
            text=text,
            ref_audio=audio_data,
            ref_sr=sample_rate,
            ref_text=self.saved_voice_text
        )

        # Return the path that will be populated as audio streams in.
        return self.streaming_tts.output_file

    def update_history(self, user_input, ai_response):
        """Append the latest exchange to the history, dropping blank turns
        and trimming to the 20 most recent entries."""
        if user_input and user_input.strip():
            self.conversation_history.append({"role": "user", "content": user_input})

        if ai_response and ai_response.strip():
            self.conversation_history.append({"role": "assistant", "content": ai_response})

        # Limit history size to bound the GPT context window.
        if len(self.conversation_history) > 20:
            self.conversation_history = self.conversation_history[-20:]
901
# Module-level singleton engine shared by the UI callbacks.
conversation_engine = ConversationEngine()