Update app.py
Browse files
app.py
CHANGED
@@ -628,275 +628,275 @@ class StreamingTTS:
|
|
628 |
return self.output_file
|
629 |
return None
|
630 |
|
631 |
-
|
632 |
-
|
633 |
-
|
634 |
-
|
635 |
-
|
636 |
-
|
637 |
-
|
638 |
|
639 |
-
#
|
640 |
-
|
641 |
-
|
642 |
-
|
643 |
-
|
644 |
-
#
|
645 |
-
|
646 |
-
|
647 |
-
|
648 |
-
|
649 |
-
|
650 |
-
|
651 |
-
#
|
652 |
-
|
653 |
|
654 |
-
#
|
655 |
-
|
656 |
|
657 |
-
#
|
658 |
-
|
659 |
-
|
660 |
|
661 |
-
#
|
662 |
-
|
663 |
-
|
664 |
-
|
665 |
-
|
666 |
-
|
667 |
-
|
668 |
-
|
669 |
-
|
670 |
-
|
671 |
-
|
672 |
-
|
673 |
-
#
|
674 |
-
|
675 |
-
|
676 |
-
#
|
677 |
-
|
678 |
-
|
679 |
-
|
680 |
-
|
681 |
-
|
682 |
-
|
683 |
-
|
684 |
-
|
685 |
-
|
686 |
-
#
|
687 |
-
|
688 |
-
|
689 |
-
|
690 |
-
#
|
691 |
-
|
692 |
-
|
693 |
-
#
|
694 |
-
|
695 |
-
|
696 |
-
|
697 |
-
|
698 |
-
|
699 |
-
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
-
|
704 |
-
|
705 |
-
|
706 |
-
|
707 |
-
|
708 |
-
|
709 |
-
|
710 |
-
|
711 |
-
|
712 |
-
|
713 |
-
#
|
714 |
-
|
715 |
-
|
716 |
-
|
717 |
-
|
718 |
-
|
719 |
-
|
720 |
-
|
721 |
-
|
722 |
-
|
723 |
-
|
724 |
-
|
725 |
-
|
726 |
-
|
727 |
|
728 |
-
#
|
729 |
-
|
730 |
|
731 |
-
#
|
732 |
-
|
733 |
-
|
734 |
-
|
735 |
|
736 |
-
|
737 |
-
|
738 |
-
|
739 |
-
|
740 |
-
|
741 |
-
|
742 |
-
|
743 |
-
|
744 |
-
|
745 |
-
|
746 |
-
|
747 |
-
|
748 |
-
|
749 |
-
|
750 |
-
#
|
751 |
-
|
752 |
-
|
753 |
-
#
|
754 |
-
|
755 |
-
|
756 |
-
|
757 |
-
|
758 |
-
#
|
759 |
-
|
760 |
-
|
761 |
-
#
|
762 |
-
|
763 |
-
|
764 |
-
|
765 |
-
|
766 |
-
|
767 |
-
|
768 |
-
|
769 |
-
|
770 |
-
|
771 |
-
|
772 |
-
|
773 |
-
|
774 |
-
|
775 |
-
|
776 |
-
|
777 |
-
|
778 |
-
|
779 |
-
|
780 |
-
|
781 |
-
|
782 |
-
|
783 |
-
|
784 |
-
|
785 |
-
|
786 |
-
|
787 |
-
|
788 |
-
|
789 |
-
|
790 |
|
791 |
-
#
|
792 |
-
|
793 |
-
|
794 |
-
|
795 |
-
|
796 |
-
|
797 |
-
#
|
798 |
-
|
799 |
-
|
800 |
|
801 |
-
#
|
802 |
-
|
803 |
-
|
804 |
-
|
805 |
|
806 |
-
#
|
807 |
-
|
808 |
-
|
809 |
-
#
|
810 |
-
|
811 |
-
|
812 |
-
|
813 |
-
|
814 |
-
#
|
815 |
-
|
816 |
-
|
817 |
-
|
818 |
-
#
|
819 |
-
|
820 |
-
#
|
821 |
-
|
822 |
-
|
823 |
-
|
824 |
-
|
825 |
-
|
826 |
|
827 |
-
|
828 |
-
|
829 |
|
830 |
-
#
|
831 |
-
|
832 |
|
833 |
-
#
|
834 |
-
|
835 |
|
836 |
-
#
|
837 |
-
|
838 |
|
839 |
-
|
840 |
-
|
841 |
|
842 |
-
|
843 |
-
|
844 |
-
|
845 |
-
|
846 |
-
|
847 |
-
#
|
848 |
-
#
|
849 |
-
|
850 |
-
|
851 |
-
|
852 |
-
|
853 |
-
|
854 |
-
|
855 |
-
|
856 |
-
|
857 |
-
|
858 |
-
|
859 |
-
|
860 |
-
|
861 |
-
|
862 |
-
|
863 |
-
|
864 |
-
|
865 |
-
|
866 |
-
|
867 |
-
|
868 |
-
|
869 |
-
|
870 |
-
|
871 |
|
872 |
-
|
873 |
-
|
874 |
-
|
875 |
|
876 |
-
|
877 |
|
878 |
-
#
|
879 |
-
|
880 |
-
|
881 |
-
|
882 |
-
|
883 |
-
|
884 |
-
|
885 |
|
886 |
-
#
|
887 |
-
|
888 |
|
889 |
-
|
890 |
-
|
891 |
-
|
892 |
-
|
893 |
|
894 |
-
|
895 |
-
|
896 |
|
897 |
-
#
|
898 |
-
|
899 |
-
|
900 |
|
901 |
# Initialize global conversation engine
|
902 |
conversation_engine = ConversationEngine()
|
|
|
628 |
return self.output_file
|
629 |
return None
|
630 |
|
631 |
+
class ConversationEngine:
|
632 |
+
def __init__(self):
|
633 |
+
self.conversation_history = []
|
634 |
+
self.system_prompt = "You are a helpful assistant that speaks Malayalam fluently. Always respond in Malayalam script with proper formatting."
|
635 |
+
self.saved_voice = None
|
636 |
+
self.saved_voice_text = ""
|
637 |
+
self.tts_cache = {} # Cache for TTS outputs
|
638 |
|
639 |
+
# TTS background processing queue
|
640 |
+
self.tts_queue = queue.Queue()
|
641 |
+
self.tts_thread = threading.Thread(target=self.tts_worker, daemon=True)
|
642 |
+
self.tts_thread.start()
|
643 |
+
|
644 |
+
# Initialize streaming TTS
|
645 |
+
self.streaming_tts = StreamingTTS()
|
646 |
+
|
647 |
+
def tts_worker(self):
|
648 |
+
"""Background worker to process TTS requests"""
|
649 |
+
while True:
|
650 |
+
try:
|
651 |
+
# Get text and callback from queue
|
652 |
+
text, callback = self.tts_queue.get()
|
653 |
|
654 |
+
# Generate speech
|
655 |
+
audio_path = self._generate_tts(text)
|
656 |
|
657 |
+
# Execute callback with result
|
658 |
+
if callback:
|
659 |
+
callback(audio_path)
|
660 |
|
661 |
+
# Mark task as done
|
662 |
+
self.tts_queue.task_done()
|
663 |
+
except Exception as e:
|
664 |
+
print(f"Error in TTS worker: {e}")
|
665 |
+
traceback.print_exc()
|
666 |
+
|
667 |
+
def transcribe_audio(self, audio_data, language="ml-IN"):
|
668 |
+
"""Convert audio to text using speech recognition"""
|
669 |
+
if audio_data is None:
|
670 |
+
print("No audio data received")
|
671 |
+
return "No audio detected", ""
|
672 |
+
|
673 |
+
# Make sure we have audio data in the expected format
|
674 |
+
try:
|
675 |
+
if isinstance(audio_data, tuple) and len(audio_data) == 2:
|
676 |
+
# Expected format: (sample_rate, audio_samples)
|
677 |
+
sample_rate, audio_samples = audio_data
|
678 |
+
else:
|
679 |
+
print(f"Unexpected audio format: {type(audio_data)}")
|
680 |
+
return "Invalid audio format", ""
|
681 |
+
|
682 |
+
if len(audio_samples) == 0:
|
683 |
+
print("Empty audio samples")
|
684 |
+
return "No speech detected", ""
|
685 |
+
|
686 |
+
# Save the audio temporarily
|
687 |
+
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
688 |
+
temp_file.close()
|
689 |
+
|
690 |
+
# Save the audio data to the temporary file
|
691 |
+
sf.write(temp_file.name, audio_samples, sample_rate)
|
692 |
+
|
693 |
+
# Use speech recognition on the file
|
694 |
+
recognizer = sr.Recognizer()
|
695 |
+
with sr.AudioFile(temp_file.name) as source:
|
696 |
+
audio = recognizer.record(source)
|
697 |
+
|
698 |
+
text = recognizer.recognize_google(audio, language=language)
|
699 |
+
print(f"Recognized: {text}")
|
700 |
+
return text, text
|
701 |
+
|
702 |
+
except sr.UnknownValueError:
|
703 |
+
print("Speech recognition could not understand audio")
|
704 |
+
return "Could not understand audio", ""
|
705 |
+
except sr.RequestError as e:
|
706 |
+
print(f"Could not request results from Google Speech Recognition service: {e}")
|
707 |
+
return f"Speech recognition service error: {str(e)}", ""
|
708 |
+
except Exception as e:
|
709 |
+
print(f"Error processing audio: {e}")
|
710 |
+
traceback.print_exc()
|
711 |
+
return f"Error processing audio: {str(e)}", ""
|
712 |
+
finally:
|
713 |
+
# Clean up temporary file
|
714 |
+
if 'temp_file' in locals() and os.path.exists(temp_file.name):
|
715 |
+
try:
|
716 |
+
os.unlink(temp_file.name)
|
717 |
+
except Exception as e:
|
718 |
+
print(f"Error deleting temporary file: {e}")
|
719 |
+
|
720 |
+
def save_reference_voice(self, audio_data, reference_text):
|
721 |
+
"""Save the reference voice for future TTS generation"""
|
722 |
+
if audio_data is None or not reference_text.strip():
|
723 |
+
return "Error: Both reference audio and text are required"
|
724 |
+
|
725 |
+
self.saved_voice = audio_data
|
726 |
+
self.saved_voice_text = reference_text.strip()
|
727 |
|
728 |
+
# Clear TTS cache when voice changes
|
729 |
+
self.tts_cache.clear()
|
730 |
|
731 |
+
# Debug info
|
732 |
+
sample_rate, audio_samples = audio_data
|
733 |
+
print(f"Saved reference voice: {len(audio_samples)} samples at {sample_rate}Hz")
|
734 |
+
print(f"Reference text: {reference_text}")
|
735 |
|
736 |
+
return f"Voice saved successfully! Reference text: {reference_text}"
|
737 |
+
|
738 |
+
def process_text_input(self, text):
|
739 |
+
"""Process text input from user"""
|
740 |
+
if text and text.strip():
|
741 |
+
return text, text
|
742 |
+
return "No input provided", ""
|
743 |
+
|
744 |
+
def generate_response(self, input_text):
|
745 |
+
"""Generate AI response using GPT-3.5 Turbo"""
|
746 |
+
if not input_text or not input_text.strip():
|
747 |
+
return "ഇൻപുട്ട് ലഭിച്ചില്ല. വീണ്ടും ശ്രമിക്കുക.", None # "No input received. Please try again."
|
748 |
+
|
749 |
+
try:
|
750 |
+
# Prepare conversation context from history
|
751 |
+
messages = [{"role": "system", "content": self.system_prompt}]
|
752 |
+
|
753 |
+
# Add previous conversations for context
|
754 |
+
for entry in self.conversation_history:
|
755 |
+
role = "user" if entry["role"] == "user" else "assistant"
|
756 |
+
messages.append({"role": role, "content": entry["content"]})
|
757 |
+
|
758 |
+
# Add current input
|
759 |
+
messages.append({"role": "user", "content": input_text})
|
760 |
+
|
761 |
+
# Call OpenAI API
|
762 |
+
response = openai.ChatCompletion.create(
|
763 |
+
model="gpt-3.5-turbo",
|
764 |
+
messages=messages,
|
765 |
+
max_tokens=500,
|
766 |
+
temperature=0.7
|
767 |
+
)
|
768 |
+
|
769 |
+
response_text = response.choices[0].message["content"].strip()
|
770 |
+
return response_text, None
|
771 |
+
|
772 |
+
except Exception as e:
|
773 |
+
error_msg = f"എറർ: GPT മോഡലിൽ നിന്ന് ഉത്തരം ലഭിക്കുന്നതിൽ പ്രശ്നമുണ്ടായി: {str(e)}"
|
774 |
+
print(f"Error in GPT response: {e}")
|
775 |
+
traceback.print_exc()
|
776 |
+
return error_msg, None
|
777 |
+
|
778 |
+
def resample_audio(self, audio, orig_sr, target_sr):
|
779 |
+
"""Resample audio to match target sample rate only if necessary"""
|
780 |
+
if orig_sr != target_sr:
|
781 |
+
print(f"Resampling audio from {orig_sr}Hz to {target_sr}Hz")
|
782 |
+
return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
|
783 |
+
return audio
|
784 |
+
|
785 |
+
def _generate_tts(self, text):
|
786 |
+
"""Internal method to generate TTS without threading"""
|
787 |
+
if not text or not text.strip():
|
788 |
+
print("No text provided for TTS generation")
|
789 |
+
return None
|
790 |
|
791 |
+
# Check cache first
|
792 |
+
if text in self.tts_cache:
|
793 |
+
print("Using cached TTS output")
|
794 |
+
return self.tts_cache[text]
|
795 |
+
|
796 |
+
try:
|
797 |
+
# Check if we have a saved voice and the TTS model
|
798 |
+
if self.saved_voice is not None and tts_model is not None:
|
799 |
+
sample_rate, audio_data = self.saved_voice
|
800 |
|
801 |
+
# Create a temporary file for the reference audio
|
802 |
+
ref_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
803 |
+
ref_temp_file.close()
|
804 |
+
print(f"Saving reference audio to {ref_temp_file.name}")
|
805 |
|
806 |
+
# Save the reference audio data
|
807 |
+
sf.write(ref_temp_file.name, audio_data, sample_rate)
|
808 |
+
|
809 |
+
# Create a temporary file for the output audio
|
810 |
+
output_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
811 |
+
output_temp_file.close()
|
812 |
+
|
813 |
+
try:
|
814 |
+
# Generate speech using IndicF5 - simplified approach from second file
|
815 |
+
print(f"Generating speech with IndicF5. Text: {text[:30]}...")
|
816 |
+
start_time = time.time()
|
817 |
+
|
818 |
+
# Use torch.no_grad() to save memory and computation
|
819 |
+
with torch.no_grad():
|
820 |
+
# Run the inference using the wrapper
|
821 |
+
synth_audio = tts_model_wrapper.generate(
|
822 |
+
text,
|
823 |
+
ref_audio_path=ref_temp_file.name,
|
824 |
+
ref_text=self.saved_voice_text
|
825 |
+
)
|
826 |
|
827 |
+
end_time = time.time()
|
828 |
+
print(f"Speech generation completed in {end_time - start_time:.2f} seconds")
|
829 |
|
830 |
+
# Process audio for better quality
|
831 |
+
synth_audio = enhance_audio(synth_audio)
|
832 |
|
833 |
+
# Save the synthesized audio
|
834 |
+
sf.write(output_temp_file.name, synth_audio, 24000) # IndicF5 uses 24kHz
|
835 |
|
836 |
+
# Add to cache
|
837 |
+
self.tts_cache[text] = output_temp_file.name
|
838 |
|
839 |
+
print(f"TTS output saved to {output_temp_file.name}")
|
840 |
+
return output_temp_file.name
|
841 |
|
842 |
+
except Exception as e:
|
843 |
+
print(f"Error generating speech: {e}")
|
844 |
+
traceback.print_exc()
|
845 |
+
return None
|
846 |
+
finally:
|
847 |
+
# We don't delete the output file as it's returned to the caller
|
848 |
+
# But clean up reference file
|
849 |
+
try:
|
850 |
+
os.unlink(ref_temp_file.name)
|
851 |
+
except Exception as e:
|
852 |
+
print(f"Error cleaning up reference file: {e}")
|
853 |
+
else:
|
854 |
+
print("No saved voice reference or TTS model not loaded")
|
855 |
+
return None
|
856 |
+
except Exception as e:
|
857 |
+
print(f"Error in TTS processing: {e}")
|
858 |
+
traceback.print_exc()
|
859 |
+
return None
|
860 |
+
|
861 |
+
def queue_tts_generation(self, text, callback=None):
|
862 |
+
"""Queue TTS generation in background thread"""
|
863 |
+
print(f"Queueing TTS generation for text: {text[:30]}...")
|
864 |
+
self.tts_queue.put((text, callback))
|
865 |
+
|
866 |
+
def generate_streamed_speech(self, text):
|
867 |
+
"""Generate speech in a streaming manner for low latency"""
|
868 |
+
if not self.saved_voice:
|
869 |
+
print("No reference voice saved")
|
870 |
+
return None
|
871 |
|
872 |
+
if not text or not text.strip():
|
873 |
+
print("No text provided for streaming TTS")
|
874 |
+
return None
|
875 |
|
876 |
+
sample_rate, audio_data = self.saved_voice
|
877 |
|
878 |
+
# Start streaming generation
|
879 |
+
self.streaming_tts.generate(
|
880 |
+
text=text,
|
881 |
+
ref_audio=audio_data,
|
882 |
+
ref_sr=sample_rate,
|
883 |
+
ref_text=self.saved_voice_text
|
884 |
+
)
|
885 |
|
886 |
+
# Return the path that will be populated
|
887 |
+
return self.streaming_tts.output_file
|
888 |
|
889 |
+
def update_history(self, user_input, ai_response):
|
890 |
+
"""Update conversation history"""
|
891 |
+
if user_input and user_input.strip():
|
892 |
+
self.conversation_history.append({"role": "user", "content": user_input})
|
893 |
|
894 |
+
if ai_response and ai_response.strip():
|
895 |
+
self.conversation_history.append({"role": "assistant", "content": ai_response})
|
896 |
|
897 |
+
# Limit history size
|
898 |
+
if len(self.conversation_history) > 20:
|
899 |
+
self.conversation_history = self.conversation_history[-20:]
|
900 |
|
# Module-level singleton: the shared conversation engine used by the app's UI handlers.
conversation_engine = ConversationEngine()