Update app.py
Browse files
app.py
CHANGED
@@ -628,275 +628,275 @@ class StreamingTTS:
|
|
628 |
return self.output_file
|
629 |
return None
|
630 |
|
631 |
-
class ConversationEngine:
|
632 |
-
|
633 |
-
|
634 |
-
|
635 |
-
|
636 |
-
|
637 |
-
|
638 |
|
639 |
-
|
640 |
-
|
641 |
-
|
642 |
-
|
643 |
-
|
644 |
-
|
645 |
-
|
646 |
-
|
647 |
-
|
648 |
-
|
649 |
-
|
650 |
-
|
651 |
-
|
652 |
-
|
653 |
|
654 |
-
|
655 |
-
|
656 |
|
657 |
-
|
658 |
-
|
659 |
-
|
660 |
|
661 |
-
|
662 |
-
|
663 |
-
|
664 |
-
|
665 |
-
|
666 |
-
|
667 |
-
|
668 |
-
|
669 |
-
|
670 |
-
|
671 |
-
|
672 |
-
|
673 |
-
|
674 |
-
|
675 |
-
|
676 |
-
|
677 |
-
|
678 |
-
|
679 |
-
|
680 |
-
|
681 |
-
|
682 |
-
|
683 |
-
|
684 |
-
|
685 |
-
|
686 |
-
|
687 |
-
|
688 |
-
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
|
696 |
-
|
697 |
-
|
698 |
-
|
699 |
-
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
-
|
704 |
-
|
705 |
-
|
706 |
-
|
707 |
-
|
708 |
-
|
709 |
-
|
710 |
-
|
711 |
-
|
712 |
-
|
713 |
-
|
714 |
-
|
715 |
-
|
716 |
-
|
717 |
-
|
718 |
-
|
719 |
-
|
720 |
-
|
721 |
-
|
722 |
-
|
723 |
-
|
724 |
-
|
725 |
-
|
726 |
-
|
727 |
|
728 |
-
|
729 |
-
|
730 |
|
731 |
-
|
732 |
-
|
733 |
-
|
734 |
-
|
735 |
|
736 |
-
|
737 |
-
|
738 |
-
|
739 |
-
|
740 |
-
|
741 |
-
|
742 |
-
|
743 |
-
|
744 |
-
|
745 |
-
|
746 |
-
|
747 |
-
|
748 |
-
|
749 |
-
|
750 |
-
|
751 |
-
|
752 |
-
|
753 |
-
|
754 |
-
|
755 |
-
|
756 |
-
|
757 |
-
|
758 |
-
|
759 |
-
|
760 |
-
|
761 |
-
|
762 |
-
|
763 |
-
|
764 |
-
|
765 |
-
|
766 |
-
|
767 |
-
|
768 |
-
|
769 |
-
|
770 |
-
|
771 |
-
|
772 |
-
|
773 |
-
|
774 |
-
|
775 |
-
|
776 |
-
|
777 |
-
|
778 |
-
|
779 |
-
|
780 |
-
|
781 |
-
|
782 |
-
|
783 |
-
|
784 |
-
|
785 |
-
|
786 |
-
|
787 |
-
|
788 |
-
|
789 |
-
|
790 |
|
791 |
-
|
792 |
-
|
793 |
-
|
794 |
-
|
795 |
-
|
796 |
-
|
797 |
-
|
798 |
-
|
799 |
-
|
800 |
|
801 |
-
|
802 |
-
|
803 |
-
|
804 |
-
|
805 |
|
806 |
-
|
807 |
-
|
808 |
-
|
809 |
-
|
810 |
-
|
811 |
-
|
812 |
-
|
813 |
-
|
814 |
-
|
815 |
-
|
816 |
-
|
817 |
-
|
818 |
-
|
819 |
-
|
820 |
-
|
821 |
-
|
822 |
-
|
823 |
-
|
824 |
-
|
825 |
-
|
826 |
|
827 |
-
|
828 |
-
|
829 |
|
830 |
-
|
831 |
-
|
832 |
|
833 |
-
|
834 |
-
|
835 |
|
836 |
-
|
837 |
-
|
838 |
|
839 |
-
|
840 |
-
|
841 |
|
842 |
-
|
843 |
-
|
844 |
-
|
845 |
-
|
846 |
-
|
847 |
-
|
848 |
-
|
849 |
-
|
850 |
-
|
851 |
-
|
852 |
-
|
853 |
-
|
854 |
-
|
855 |
-
|
856 |
-
|
857 |
-
|
858 |
-
|
859 |
-
|
860 |
-
|
861 |
-
|
862 |
-
|
863 |
-
|
864 |
-
|
865 |
-
|
866 |
-
|
867 |
-
|
868 |
-
|
869 |
-
|
870 |
-
|
871 |
|
872 |
-
|
873 |
-
|
874 |
-
|
875 |
|
876 |
-
|
877 |
|
878 |
-
|
879 |
-
|
880 |
-
|
881 |
-
|
882 |
-
|
883 |
-
|
884 |
-
|
885 |
|
886 |
-
|
887 |
-
|
888 |
|
889 |
-
|
890 |
-
|
891 |
-
|
892 |
-
|
893 |
|
894 |
-
|
895 |
-
|
896 |
|
897 |
-
|
898 |
-
|
899 |
-
|
900 |
|
901 |
# Initialize global conversation engine
|
902 |
conversation_engine = ConversationEngine()
|
@@ -1421,12 +1421,12 @@ def create_chatbot_interface():
|
|
1421 |
voice_status = gr.Textbox(label="Voice Status", value="No voice saved yet")
|
1422 |
|
1423 |
# Language selector and controls for chat
|
1424 |
-
with gr.Row(elem_classes=["chat-controls"]):
|
1425 |
-
|
1426 |
-
|
1427 |
-
|
1428 |
-
|
1429 |
-
|
1430 |
clear_btn = gr.Button("🧹 Clear Chat", scale=0)
|
1431 |
|
1432 |
# Chat display area
|
@@ -1438,15 +1438,15 @@ def create_chatbot_interface():
|
|
1438 |
elem_classes=["chat-window"]
|
1439 |
)
|
1440 |
|
1441 |
-
# Progress bar for TTS generation
|
1442 |
-
with gr.Row():
|
1443 |
-
|
1444 |
-
|
1445 |
-
|
1446 |
-
|
1447 |
-
|
1448 |
-
|
1449 |
-
|
1450 |
|
1451 |
# Audio output for the bot's response
|
1452 |
audio_output = gr.Audio(
|
@@ -1456,12 +1456,12 @@ def create_chatbot_interface():
|
|
1456 |
visible=True
|
1457 |
)
|
1458 |
|
1459 |
-
# Status message for debugging
|
1460 |
-
status_msg = gr.Textbox(
|
1461 |
-
|
1462 |
-
|
1463 |
-
|
1464 |
-
)
|
1465 |
|
1466 |
# Input area with separate components
|
1467 |
with gr.Row(elem_classes=["input-area"]):
|
|
|
628 |
return self.output_file
|
629 |
return None
|
630 |
|
631 |
+
# class ConversationEngine:
|
632 |
+
# def __init__(self):
|
633 |
+
# self.conversation_history = []
|
634 |
+
# self.system_prompt = "You are a helpful assistant that speaks Malayalam fluently. Always respond in Malayalam script with proper formatting."
|
635 |
+
# self.saved_voice = None
|
636 |
+
# self.saved_voice_text = ""
|
637 |
+
# self.tts_cache = {} # Cache for TTS outputs
|
638 |
|
639 |
+
# # TTS background processing queue
|
640 |
+
# self.tts_queue = queue.Queue()
|
641 |
+
# self.tts_thread = threading.Thread(target=self.tts_worker, daemon=True)
|
642 |
+
# self.tts_thread.start()
|
643 |
+
|
644 |
+
# # Initialize streaming TTS
|
645 |
+
# self.streaming_tts = StreamingTTS()
|
646 |
+
|
647 |
+
# def tts_worker(self):
|
648 |
+
# """Background worker to process TTS requests"""
|
649 |
+
# while True:
|
650 |
+
# try:
|
651 |
+
# # Get text and callback from queue
|
652 |
+
# text, callback = self.tts_queue.get()
|
653 |
|
654 |
+
# # Generate speech
|
655 |
+
# audio_path = self._generate_tts(text)
|
656 |
|
657 |
+
# # Execute callback with result
|
658 |
+
# if callback:
|
659 |
+
# callback(audio_path)
|
660 |
|
661 |
+
# # Mark task as done
|
662 |
+
# self.tts_queue.task_done()
|
663 |
+
# except Exception as e:
|
664 |
+
# print(f"Error in TTS worker: {e}")
|
665 |
+
# traceback.print_exc()
|
666 |
+
|
667 |
+
# def transcribe_audio(self, audio_data, language="ml-IN"):
|
668 |
+
# """Convert audio to text using speech recognition"""
|
669 |
+
# if audio_data is None:
|
670 |
+
# print("No audio data received")
|
671 |
+
# return "No audio detected", ""
|
672 |
+
|
673 |
+
# # Make sure we have audio data in the expected format
|
674 |
+
# try:
|
675 |
+
# if isinstance(audio_data, tuple) and len(audio_data) == 2:
|
676 |
+
# # Expected format: (sample_rate, audio_samples)
|
677 |
+
# sample_rate, audio_samples = audio_data
|
678 |
+
# else:
|
679 |
+
# print(f"Unexpected audio format: {type(audio_data)}")
|
680 |
+
# return "Invalid audio format", ""
|
681 |
+
|
682 |
+
# if len(audio_samples) == 0:
|
683 |
+
# print("Empty audio samples")
|
684 |
+
# return "No speech detected", ""
|
685 |
+
|
686 |
+
# # Save the audio temporarily
|
687 |
+
# temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
688 |
+
# temp_file.close()
|
689 |
+
|
690 |
+
# # Save the audio data to the temporary file
|
691 |
+
# sf.write(temp_file.name, audio_samples, sample_rate)
|
692 |
+
|
693 |
+
# # Use speech recognition on the file
|
694 |
+
# recognizer = sr.Recognizer()
|
695 |
+
# with sr.AudioFile(temp_file.name) as source:
|
696 |
+
# audio = recognizer.record(source)
|
697 |
+
|
698 |
+
# text = recognizer.recognize_google(audio, language=language)
|
699 |
+
# print(f"Recognized: {text}")
|
700 |
+
# return text, text
|
701 |
+
|
702 |
+
# except sr.UnknownValueError:
|
703 |
+
# print("Speech recognition could not understand audio")
|
704 |
+
# return "Could not understand audio", ""
|
705 |
+
# except sr.RequestError as e:
|
706 |
+
# print(f"Could not request results from Google Speech Recognition service: {e}")
|
707 |
+
# return f"Speech recognition service error: {str(e)}", ""
|
708 |
+
# except Exception as e:
|
709 |
+
# print(f"Error processing audio: {e}")
|
710 |
+
# traceback.print_exc()
|
711 |
+
# return f"Error processing audio: {str(e)}", ""
|
712 |
+
# finally:
|
713 |
+
# # Clean up temporary file
|
714 |
+
# if 'temp_file' in locals() and os.path.exists(temp_file.name):
|
715 |
+
# try:
|
716 |
+
# os.unlink(temp_file.name)
|
717 |
+
# except Exception as e:
|
718 |
+
# print(f"Error deleting temporary file: {e}")
|
719 |
+
|
720 |
+
# def save_reference_voice(self, audio_data, reference_text):
|
721 |
+
# """Save the reference voice for future TTS generation"""
|
722 |
+
# if audio_data is None or not reference_text.strip():
|
723 |
+
# return "Error: Both reference audio and text are required"
|
724 |
+
|
725 |
+
# self.saved_voice = audio_data
|
726 |
+
# self.saved_voice_text = reference_text.strip()
|
727 |
|
728 |
+
# # Clear TTS cache when voice changes
|
729 |
+
# self.tts_cache.clear()
|
730 |
|
731 |
+
# # Debug info
|
732 |
+
# sample_rate, audio_samples = audio_data
|
733 |
+
# print(f"Saved reference voice: {len(audio_samples)} samples at {sample_rate}Hz")
|
734 |
+
# print(f"Reference text: {reference_text}")
|
735 |
|
736 |
+
# return f"Voice saved successfully! Reference text: {reference_text}"
|
737 |
+
|
738 |
+
# def process_text_input(self, text):
|
739 |
+
# """Process text input from user"""
|
740 |
+
# if text and text.strip():
|
741 |
+
# return text, text
|
742 |
+
# return "No input provided", ""
|
743 |
+
|
744 |
+
# def generate_response(self, input_text):
|
745 |
+
# """Generate AI response using GPT-3.5 Turbo"""
|
746 |
+
# if not input_text or not input_text.strip():
|
747 |
+
# return "ഇൻപുട്ട് ലഭിച്ചില്ല. വീണ്ടും ശ്രമിക്കുക.", None # "No input received. Please try again."
|
748 |
+
|
749 |
+
# try:
|
750 |
+
# # Prepare conversation context from history
|
751 |
+
# messages = [{"role": "system", "content": self.system_prompt}]
|
752 |
+
|
753 |
+
# # Add previous conversations for context
|
754 |
+
# for entry in self.conversation_history:
|
755 |
+
# role = "user" if entry["role"] == "user" else "assistant"
|
756 |
+
# messages.append({"role": role, "content": entry["content"]})
|
757 |
+
|
758 |
+
# # Add current input
|
759 |
+
# messages.append({"role": "user", "content": input_text})
|
760 |
+
|
761 |
+
# # Call OpenAI API
|
762 |
+
# response = openai.ChatCompletion.create(
|
763 |
+
# model="gpt-3.5-turbo",
|
764 |
+
# messages=messages,
|
765 |
+
# max_tokens=500,
|
766 |
+
# temperature=0.7
|
767 |
+
# )
|
768 |
+
|
769 |
+
# response_text = response.choices[0].message["content"].strip()
|
770 |
+
# return response_text, None
|
771 |
+
|
772 |
+
# except Exception as e:
|
773 |
+
# error_msg = f"എറർ: GPT മോഡലിൽ നിന്ന് ഉത്തരം ലഭിക്കുന്നതിൽ പ്രശ്നമുണ്ടായി: {str(e)}"
|
774 |
+
# print(f"Error in GPT response: {e}")
|
775 |
+
# traceback.print_exc()
|
776 |
+
# return error_msg, None
|
777 |
+
|
778 |
+
# def resample_audio(self, audio, orig_sr, target_sr):
|
779 |
+
# """Resample audio to match target sample rate only if necessary"""
|
780 |
+
# if orig_sr != target_sr:
|
781 |
+
# print(f"Resampling audio from {orig_sr}Hz to {target_sr}Hz")
|
782 |
+
# return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
|
783 |
+
# return audio
|
784 |
+
|
785 |
+
# def _generate_tts(self, text):
|
786 |
+
# """Internal method to generate TTS without threading"""
|
787 |
+
# if not text or not text.strip():
|
788 |
+
# print("No text provided for TTS generation")
|
789 |
+
# return None
|
790 |
|
791 |
+
# # Check cache first
|
792 |
+
# if text in self.tts_cache:
|
793 |
+
# print("Using cached TTS output")
|
794 |
+
# return self.tts_cache[text]
|
795 |
+
|
796 |
+
# try:
|
797 |
+
# # Check if we have a saved voice and the TTS model
|
798 |
+
# if self.saved_voice is not None and tts_model is not None:
|
799 |
+
# sample_rate, audio_data = self.saved_voice
|
800 |
|
801 |
+
# # Create a temporary file for the reference audio
|
802 |
+
# ref_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
803 |
+
# ref_temp_file.close()
|
804 |
+
# print(f"Saving reference audio to {ref_temp_file.name}")
|
805 |
|
806 |
+
# # Save the reference audio data
|
807 |
+
# sf.write(ref_temp_file.name, audio_data, sample_rate)
|
808 |
+
|
809 |
+
# # Create a temporary file for the output audio
|
810 |
+
# output_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
811 |
+
# output_temp_file.close()
|
812 |
+
|
813 |
+
# try:
|
814 |
+
# # Generate speech using IndicF5 - simplified approach from second file
|
815 |
+
# print(f"Generating speech with IndicF5. Text: {text[:30]}...")
|
816 |
+
# start_time = time.time()
|
817 |
+
|
818 |
+
# # Use torch.no_grad() to save memory and computation
|
819 |
+
# with torch.no_grad():
|
820 |
+
# # Run the inference using the wrapper
|
821 |
+
# synth_audio = tts_model_wrapper.generate(
|
822 |
+
# text,
|
823 |
+
# ref_audio_path=ref_temp_file.name,
|
824 |
+
# ref_text=self.saved_voice_text
|
825 |
+
# )
|
826 |
|
827 |
+
# end_time = time.time()
|
828 |
+
# print(f"Speech generation completed in {end_time - start_time:.2f} seconds")
|
829 |
|
830 |
+
# # Process audio for better quality
|
831 |
+
# synth_audio = enhance_audio(synth_audio)
|
832 |
|
833 |
+
# # Save the synthesized audio
|
834 |
+
# sf.write(output_temp_file.name, synth_audio, 24000) # IndicF5 uses 24kHz
|
835 |
|
836 |
+
# # Add to cache
|
837 |
+
# self.tts_cache[text] = output_temp_file.name
|
838 |
|
839 |
+
# print(f"TTS output saved to {output_temp_file.name}")
|
840 |
+
# return output_temp_file.name
|
841 |
|
842 |
+
# except Exception as e:
|
843 |
+
# print(f"Error generating speech: {e}")
|
844 |
+
# traceback.print_exc()
|
845 |
+
# return None
|
846 |
+
# finally:
|
847 |
+
# # We don't delete the output file as it's returned to the caller
|
848 |
+
# # But clean up reference file
|
849 |
+
# try:
|
850 |
+
# os.unlink(ref_temp_file.name)
|
851 |
+
# except Exception as e:
|
852 |
+
# print(f"Error cleaning up reference file: {e}")
|
853 |
+
# else:
|
854 |
+
# print("No saved voice reference or TTS model not loaded")
|
855 |
+
# return None
|
856 |
+
# except Exception as e:
|
857 |
+
# print(f"Error in TTS processing: {e}")
|
858 |
+
# traceback.print_exc()
|
859 |
+
# return None
|
860 |
+
|
861 |
+
# def queue_tts_generation(self, text, callback=None):
|
862 |
+
# """Queue TTS generation in background thread"""
|
863 |
+
# print(f"Queueing TTS generation for text: {text[:30]}...")
|
864 |
+
# self.tts_queue.put((text, callback))
|
865 |
+
|
866 |
+
# def generate_streamed_speech(self, text):
|
867 |
+
# """Generate speech in a streaming manner for low latency"""
|
868 |
+
# if not self.saved_voice:
|
869 |
+
# print("No reference voice saved")
|
870 |
+
# return None
|
871 |
|
872 |
+
# if not text or not text.strip():
|
873 |
+
# print("No text provided for streaming TTS")
|
874 |
+
# return None
|
875 |
|
876 |
+
# sample_rate, audio_data = self.saved_voice
|
877 |
|
878 |
+
# # Start streaming generation
|
879 |
+
# self.streaming_tts.generate(
|
880 |
+
# text=text,
|
881 |
+
# ref_audio=audio_data,
|
882 |
+
# ref_sr=sample_rate,
|
883 |
+
# ref_text=self.saved_voice_text
|
884 |
+
# )
|
885 |
|
886 |
+
# # Return the path that will be populated
|
887 |
+
# return self.streaming_tts.output_file
|
888 |
|
889 |
+
# def update_history(self, user_input, ai_response):
|
890 |
+
# """Update conversation history"""
|
891 |
+
# if user_input and user_input.strip():
|
892 |
+
# self.conversation_history.append({"role": "user", "content": user_input})
|
893 |
|
894 |
+
# if ai_response and ai_response.strip():
|
895 |
+
# self.conversation_history.append({"role": "assistant", "content": ai_response})
|
896 |
|
897 |
+
# # Limit history size
|
898 |
+
# if len(self.conversation_history) > 20:
|
899 |
+
# self.conversation_history = self.conversation_history[-20:]
|
900 |
|
901 |
# Initialize global conversation engine
|
902 |
conversation_engine = ConversationEngine()
|
|
|
1421 |
voice_status = gr.Textbox(label="Voice Status", value="No voice saved yet")
|
1422 |
|
1423 |
# Language selector and controls for chat
|
1424 |
+
# with gr.Row(elem_classes=["chat-controls"]):
|
1425 |
+
# language_selector = gr.Dropdown(
|
1426 |
+
# choices=["ml-IN", "en-US", "hi-IN", "ta-IN", "te-IN", "kn-IN"],
|
1427 |
+
# value="ml-IN",
|
1428 |
+
# label="Speech Recognition Language"
|
1429 |
+
# )
|
1430 |
clear_btn = gr.Button("🧹 Clear Chat", scale=0)
|
1431 |
|
1432 |
# Chat display area
|
|
|
1438 |
elem_classes=["chat-window"]
|
1439 |
)
|
1440 |
|
1441 |
+
# # Progress bar for TTS generation
|
1442 |
+
# with gr.Row():
|
1443 |
+
# tts_progress = gr.Slider(
|
1444 |
+
# minimum=0,
|
1445 |
+
# maximum=100,
|
1446 |
+
# value=0,
|
1447 |
+
# label="TTS Progress",
|
1448 |
+
# interactive=False
|
1449 |
+
# )
|
1450 |
|
1451 |
# Audio output for the bot's response
|
1452 |
audio_output = gr.Audio(
|
|
|
1456 |
visible=True
|
1457 |
)
|
1458 |
|
1459 |
+
# # Status message for debugging
|
1460 |
+
# status_msg = gr.Textbox(
|
1461 |
+
# label="Status",
|
1462 |
+
# value="Ready",
|
1463 |
+
# interactive=False
|
1464 |
+
# )
|
1465 |
|
1466 |
# Input area with separate components
|
1467 |
with gr.Row(elem_classes=["input-area"]):
|