ceymox committed
Commit b097cb6 · verified · 1 Parent(s): 8905882

Update app.py

Files changed (1):
  1. app.py (+267, -267)
app.py CHANGED

@@ -628,275 +628,275 @@ class StreamingTTS:
             return self.output_file
         return None
 
-class ConversationEngine:
-    def __init__(self):
-        self.conversation_history = []
-        self.system_prompt = "You are a helpful assistant that speaks Malayalam fluently. Always respond in Malayalam script with proper formatting."
-        self.saved_voice = None
-        self.saved_voice_text = ""
-        self.tts_cache = {}  # Cache for TTS outputs
-
-        # TTS background processing queue
-        self.tts_queue = queue.Queue()
-        self.tts_thread = threading.Thread(target=self.tts_worker, daemon=True)
-        self.tts_thread.start()
-
-        # Initialize streaming TTS
-        self.streaming_tts = StreamingTTS()
-
-    def tts_worker(self):
-        """Background worker to process TTS requests"""
-        while True:
-            try:
-                # Get text and callback from queue
-                text, callback = self.tts_queue.get()
-
-                # Generate speech
-                audio_path = self._generate_tts(text)
-
-                # Execute callback with result
-                if callback:
-                    callback(audio_path)
-
-                # Mark task as done
-                self.tts_queue.task_done()
-            except Exception as e:
-                print(f"Error in TTS worker: {e}")
-                traceback.print_exc()
-
-    def transcribe_audio(self, audio_data, language="ml-IN"):
-        """Convert audio to text using speech recognition"""
-        if audio_data is None:
-            print("No audio data received")
-            return "No audio detected", ""
-
-        # Make sure we have audio data in the expected format
-        try:
-            if isinstance(audio_data, tuple) and len(audio_data) == 2:
-                # Expected format: (sample_rate, audio_samples)
-                sample_rate, audio_samples = audio_data
-            else:
-                print(f"Unexpected audio format: {type(audio_data)}")
-                return "Invalid audio format", ""
-
-            if len(audio_samples) == 0:
-                print("Empty audio samples")
-                return "No speech detected", ""
-
-            # Save the audio temporarily
-            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-            temp_file.close()
-
-            # Save the audio data to the temporary file
-            sf.write(temp_file.name, audio_samples, sample_rate)
-
-            # Use speech recognition on the file
-            recognizer = sr.Recognizer()
-            with sr.AudioFile(temp_file.name) as source:
-                audio = recognizer.record(source)
-
-            text = recognizer.recognize_google(audio, language=language)
-            print(f"Recognized: {text}")
-            return text, text
-
-        except sr.UnknownValueError:
-            print("Speech recognition could not understand audio")
-            return "Could not understand audio", ""
-        except sr.RequestError as e:
-            print(f"Could not request results from Google Speech Recognition service: {e}")
-            return f"Speech recognition service error: {str(e)}", ""
-        except Exception as e:
-            print(f"Error processing audio: {e}")
-            traceback.print_exc()
-            return f"Error processing audio: {str(e)}", ""
-        finally:
-            # Clean up temporary file
-            if 'temp_file' in locals() and os.path.exists(temp_file.name):
-                try:
-                    os.unlink(temp_file.name)
-                except Exception as e:
-                    print(f"Error deleting temporary file: {e}")
-
-    def save_reference_voice(self, audio_data, reference_text):
-        """Save the reference voice for future TTS generation"""
-        if audio_data is None or not reference_text.strip():
-            return "Error: Both reference audio and text are required"
-
-        self.saved_voice = audio_data
-        self.saved_voice_text = reference_text.strip()
-
-        # Clear TTS cache when voice changes
-        self.tts_cache.clear()
-
-        # Debug info
-        sample_rate, audio_samples = audio_data
-        print(f"Saved reference voice: {len(audio_samples)} samples at {sample_rate}Hz")
-        print(f"Reference text: {reference_text}")
-
-        return f"Voice saved successfully! Reference text: {reference_text}"
-
-    def process_text_input(self, text):
-        """Process text input from user"""
-        if text and text.strip():
-            return text, text
-        return "No input provided", ""
-
-    def generate_response(self, input_text):
-        """Generate AI response using GPT-3.5 Turbo"""
-        if not input_text or not input_text.strip():
-            return "ഇൻപുട്ട് ലഭിച്ചില്ല. വീണ്ടും ശ്രമിക്കുക.", None  # "No input received. Please try again."
-
-        try:
-            # Prepare conversation context from history
-            messages = [{"role": "system", "content": self.system_prompt}]
-
-            # Add previous conversations for context
-            for entry in self.conversation_history:
-                role = "user" if entry["role"] == "user" else "assistant"
-                messages.append({"role": role, "content": entry["content"]})
-
-            # Add current input
-            messages.append({"role": "user", "content": input_text})
-
-            # Call OpenAI API
-            response = openai.ChatCompletion.create(
-                model="gpt-3.5-turbo",
-                messages=messages,
-                max_tokens=500,
-                temperature=0.7
-            )
-
-            response_text = response.choices[0].message["content"].strip()
-            return response_text, None
-
-        except Exception as e:
-            error_msg = f"എറർ: GPT മോഡലിൽ നിന്ന് ഉത്തരം ലഭിക്കുന്നതിൽ പ്രശ്നമുണ്ടായി: {str(e)}"  # "Error: There was a problem getting an answer from the GPT model."
-            print(f"Error in GPT response: {e}")
-            traceback.print_exc()
-            return error_msg, None
-
-    def resample_audio(self, audio, orig_sr, target_sr):
-        """Resample audio to match target sample rate only if necessary"""
-        if orig_sr != target_sr:
-            print(f"Resampling audio from {orig_sr}Hz to {target_sr}Hz")
-            return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
-        return audio
-
-    def _generate_tts(self, text):
-        """Internal method to generate TTS without threading"""
-        if not text or not text.strip():
-            print("No text provided for TTS generation")
-            return None
-
-        # Check cache first
-        if text in self.tts_cache:
-            print("Using cached TTS output")
-            return self.tts_cache[text]
-
-        try:
-            # Check if we have a saved voice and the TTS model
-            if self.saved_voice is not None and tts_model is not None:
-                sample_rate, audio_data = self.saved_voice
-
-                # Create a temporary file for the reference audio
-                ref_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-                ref_temp_file.close()
-                print(f"Saving reference audio to {ref_temp_file.name}")
-
-                # Save the reference audio data
-                sf.write(ref_temp_file.name, audio_data, sample_rate)
-
-                # Create a temporary file for the output audio
-                output_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-                output_temp_file.close()
-
-                try:
-                    # Generate speech using IndicF5 - simplified approach from second file
-                    print(f"Generating speech with IndicF5. Text: {text[:30]}...")
-                    start_time = time.time()
-
-                    # Use torch.no_grad() to save memory and computation
-                    with torch.no_grad():
-                        # Run the inference using the wrapper
-                        synth_audio = tts_model_wrapper.generate(
-                            text,
-                            ref_audio_path=ref_temp_file.name,
-                            ref_text=self.saved_voice_text
-                        )
-
-                    end_time = time.time()
-                    print(f"Speech generation completed in {end_time - start_time:.2f} seconds")
-
-                    # Process audio for better quality
-                    synth_audio = enhance_audio(synth_audio)
-
-                    # Save the synthesized audio
-                    sf.write(output_temp_file.name, synth_audio, 24000)  # IndicF5 uses 24kHz
-
-                    # Add to cache
-                    self.tts_cache[text] = output_temp_file.name
-
-                    print(f"TTS output saved to {output_temp_file.name}")
-                    return output_temp_file.name
-
-                except Exception as e:
-                    print(f"Error generating speech: {e}")
-                    traceback.print_exc()
-                    return None
-                finally:
-                    # We don't delete the output file as it's returned to the caller
-                    # But clean up reference file
-                    try:
-                        os.unlink(ref_temp_file.name)
-                    except Exception as e:
-                        print(f"Error cleaning up reference file: {e}")
-            else:
-                print("No saved voice reference or TTS model not loaded")
-                return None
-        except Exception as e:
-            print(f"Error in TTS processing: {e}")
-            traceback.print_exc()
-            return None
-
-    def queue_tts_generation(self, text, callback=None):
-        """Queue TTS generation in background thread"""
-        print(f"Queueing TTS generation for text: {text[:30]}...")
-        self.tts_queue.put((text, callback))
-
-    def generate_streamed_speech(self, text):
-        """Generate speech in a streaming manner for low latency"""
-        if not self.saved_voice:
-            print("No reference voice saved")
-            return None
-
-        if not text or not text.strip():
-            print("No text provided for streaming TTS")
-            return None
-
-        sample_rate, audio_data = self.saved_voice
-
-        # Start streaming generation
-        self.streaming_tts.generate(
-            text=text,
-            ref_audio=audio_data,
-            ref_sr=sample_rate,
-            ref_text=self.saved_voice_text
-        )
-
-        # Return the path that will be populated
-        return self.streaming_tts.output_file
-
-    def update_history(self, user_input, ai_response):
-        """Update conversation history"""
-        if user_input and user_input.strip():
-            self.conversation_history.append({"role": "user", "content": user_input})
-
-        if ai_response and ai_response.strip():
-            self.conversation_history.append({"role": "assistant", "content": ai_response})
-
-        # Limit history size
-        if len(self.conversation_history) > 20:
-            self.conversation_history = self.conversation_history[-20:]
+# class ConversationEngine:
+#     def __init__(self):
+#         self.conversation_history = []
+#         self.system_prompt = "You are a helpful assistant that speaks Malayalam fluently. Always respond in Malayalam script with proper formatting."
+#         self.saved_voice = None
+#         self.saved_voice_text = ""
+#         self.tts_cache = {}  # Cache for TTS outputs
+
+#         # TTS background processing queue
+#         self.tts_queue = queue.Queue()
+#         self.tts_thread = threading.Thread(target=self.tts_worker, daemon=True)
+#         self.tts_thread.start()
+
+#         # Initialize streaming TTS
+#         self.streaming_tts = StreamingTTS()
+
+#     def tts_worker(self):
+#         """Background worker to process TTS requests"""
+#         while True:
+#             try:
+#                 # Get text and callback from queue
+#                 text, callback = self.tts_queue.get()
+
+#                 # Generate speech
+#                 audio_path = self._generate_tts(text)
+
+#                 # Execute callback with result
+#                 if callback:
+#                     callback(audio_path)
+
+#                 # Mark task as done
+#                 self.tts_queue.task_done()
+#             except Exception as e:
+#                 print(f"Error in TTS worker: {e}")
+#                 traceback.print_exc()
+
+#     def transcribe_audio(self, audio_data, language="ml-IN"):
+#         """Convert audio to text using speech recognition"""
+#         if audio_data is None:
+#             print("No audio data received")
+#             return "No audio detected", ""
+
+#         # Make sure we have audio data in the expected format
+#         try:
+#             if isinstance(audio_data, tuple) and len(audio_data) == 2:
+#                 # Expected format: (sample_rate, audio_samples)
+#                 sample_rate, audio_samples = audio_data
+#             else:
+#                 print(f"Unexpected audio format: {type(audio_data)}")
+#                 return "Invalid audio format", ""
+
+#             if len(audio_samples) == 0:
+#                 print("Empty audio samples")
+#                 return "No speech detected", ""
+
+#             # Save the audio temporarily
+#             temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+#             temp_file.close()
+
+#             # Save the audio data to the temporary file
+#             sf.write(temp_file.name, audio_samples, sample_rate)
+
+#             # Use speech recognition on the file
+#             recognizer = sr.Recognizer()
+#             with sr.AudioFile(temp_file.name) as source:
+#                 audio = recognizer.record(source)
+
+#             text = recognizer.recognize_google(audio, language=language)
+#             print(f"Recognized: {text}")
+#             return text, text
+
+#         except sr.UnknownValueError:
+#             print("Speech recognition could not understand audio")
+#             return "Could not understand audio", ""
+#         except sr.RequestError as e:
+#             print(f"Could not request results from Google Speech Recognition service: {e}")
+#             return f"Speech recognition service error: {str(e)}", ""
+#         except Exception as e:
+#             print(f"Error processing audio: {e}")
+#             traceback.print_exc()
+#             return f"Error processing audio: {str(e)}", ""
+#         finally:
+#             # Clean up temporary file
+#             if 'temp_file' in locals() and os.path.exists(temp_file.name):
+#                 try:
+#                     os.unlink(temp_file.name)
+#                 except Exception as e:
+#                     print(f"Error deleting temporary file: {e}")
+
+#     def save_reference_voice(self, audio_data, reference_text):
+#         """Save the reference voice for future TTS generation"""
+#         if audio_data is None or not reference_text.strip():
+#             return "Error: Both reference audio and text are required"
+
+#         self.saved_voice = audio_data
+#         self.saved_voice_text = reference_text.strip()
+
+#         # Clear TTS cache when voice changes
+#         self.tts_cache.clear()
+
+#         # Debug info
+#         sample_rate, audio_samples = audio_data
+#         print(f"Saved reference voice: {len(audio_samples)} samples at {sample_rate}Hz")
+#         print(f"Reference text: {reference_text}")
+
+#         return f"Voice saved successfully! Reference text: {reference_text}"
+
+#     def process_text_input(self, text):
+#         """Process text input from user"""
+#         if text and text.strip():
+#             return text, text
+#         return "No input provided", ""
+
+#     def generate_response(self, input_text):
+#         """Generate AI response using GPT-3.5 Turbo"""
+#         if not input_text or not input_text.strip():
+#             return "ഇൻപുട്ട് ലഭിച്ചില്ല. വീണ്ടും ശ്രമിക്കുക.", None  # "No input received. Please try again."
+
+#         try:
+#             # Prepare conversation context from history
+#             messages = [{"role": "system", "content": self.system_prompt}]
+
+#             # Add previous conversations for context
+#             for entry in self.conversation_history:
+#                 role = "user" if entry["role"] == "user" else "assistant"
+#                 messages.append({"role": role, "content": entry["content"]})
+
+#             # Add current input
+#             messages.append({"role": "user", "content": input_text})
+
+#             # Call OpenAI API
+#             response = openai.ChatCompletion.create(
+#                 model="gpt-3.5-turbo",
+#                 messages=messages,
+#                 max_tokens=500,
+#                 temperature=0.7
+#             )
+
+#             response_text = response.choices[0].message["content"].strip()
+#             return response_text, None
+
+#         except Exception as e:
+#             error_msg = f"എറർ: GPT മോഡലിൽ നിന്ന് ഉത്തരം ലഭിക്കുന്നതിൽ പ്രശ്നമുണ്ടായി: {str(e)}"  # "Error: There was a problem getting an answer from the GPT model."
+#             print(f"Error in GPT response: {e}")
+#             traceback.print_exc()
+#             return error_msg, None
+
+#     def resample_audio(self, audio, orig_sr, target_sr):
+#         """Resample audio to match target sample rate only if necessary"""
+#         if orig_sr != target_sr:
+#             print(f"Resampling audio from {orig_sr}Hz to {target_sr}Hz")
+#             return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
+#         return audio
+
+#     def _generate_tts(self, text):
+#         """Internal method to generate TTS without threading"""
+#         if not text or not text.strip():
+#             print("No text provided for TTS generation")
+#             return None
+
+#         # Check cache first
+#         if text in self.tts_cache:
+#             print("Using cached TTS output")
+#             return self.tts_cache[text]
+
+#         try:
+#             # Check if we have a saved voice and the TTS model
+#             if self.saved_voice is not None and tts_model is not None:
+#                 sample_rate, audio_data = self.saved_voice
+
+#                 # Create a temporary file for the reference audio
+#                 ref_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+#                 ref_temp_file.close()
+#                 print(f"Saving reference audio to {ref_temp_file.name}")
+
+#                 # Save the reference audio data
+#                 sf.write(ref_temp_file.name, audio_data, sample_rate)
+
+#                 # Create a temporary file for the output audio
+#                 output_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+#                 output_temp_file.close()
+
+#                 try:
+#                     # Generate speech using IndicF5 - simplified approach from second file
+#                     print(f"Generating speech with IndicF5. Text: {text[:30]}...")
+#                     start_time = time.time()
+
+#                     # Use torch.no_grad() to save memory and computation
+#                     with torch.no_grad():
+#                         # Run the inference using the wrapper
+#                         synth_audio = tts_model_wrapper.generate(
+#                             text,
+#                             ref_audio_path=ref_temp_file.name,
+#                             ref_text=self.saved_voice_text
+#                         )
+
+#                     end_time = time.time()
+#                     print(f"Speech generation completed in {end_time - start_time:.2f} seconds")
+
+#                     # Process audio for better quality
+#                     synth_audio = enhance_audio(synth_audio)
+
+#                     # Save the synthesized audio
+#                     sf.write(output_temp_file.name, synth_audio, 24000)  # IndicF5 uses 24kHz
+
+#                     # Add to cache
+#                     self.tts_cache[text] = output_temp_file.name
+
+#                     print(f"TTS output saved to {output_temp_file.name}")
+#                     return output_temp_file.name
+
+#                 except Exception as e:
+#                     print(f"Error generating speech: {e}")
+#                     traceback.print_exc()
+#                     return None
+#                 finally:
+#                     # We don't delete the output file as it's returned to the caller
+#                     # But clean up reference file
+#                     try:
+#                         os.unlink(ref_temp_file.name)
+#                     except Exception as e:
+#                         print(f"Error cleaning up reference file: {e}")
+#             else:
+#                 print("No saved voice reference or TTS model not loaded")
+#                 return None
+#         except Exception as e:
+#             print(f"Error in TTS processing: {e}")
+#             traceback.print_exc()
+#             return None
+
+#     def queue_tts_generation(self, text, callback=None):
+#         """Queue TTS generation in background thread"""
+#         print(f"Queueing TTS generation for text: {text[:30]}...")
+#         self.tts_queue.put((text, callback))
+
+#     def generate_streamed_speech(self, text):
+#         """Generate speech in a streaming manner for low latency"""
+#         if not self.saved_voice:
+#             print("No reference voice saved")
+#             return None
+
+#         if not text or not text.strip():
+#             print("No text provided for streaming TTS")
+#             return None
+
+#         sample_rate, audio_data = self.saved_voice
+
+#         # Start streaming generation
+#         self.streaming_tts.generate(
+#             text=text,
+#             ref_audio=audio_data,
+#             ref_sr=sample_rate,
+#             ref_text=self.saved_voice_text
+#         )
+
+#         # Return the path that will be populated
+#         return self.streaming_tts.output_file
+
+#     def update_history(self, user_input, ai_response):
+#         """Update conversation history"""
+#         if user_input and user_input.strip():
+#             self.conversation_history.append({"role": "user", "content": user_input})
+
+#         if ai_response and ai_response.strip():
+#             self.conversation_history.append({"role": "assistant", "content": ai_response})
+
+#         # Limit history size
+#         if len(self.conversation_history) > 20:
+#             self.conversation_history = self.conversation_history[-20:]
 
 # Initialize global conversation engine
 conversation_engine = ConversationEngine()
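
Review note: two things stand out in this hunk. First, the unchanged context line `conversation_engine = ConversationEngine()` still instantiates the class that this revision comments out, so unless another ConversationEngine definition exists elsewhere in app.py, the file will raise a NameError at import. Second, the commented-out generate_response targets the legacy `openai.ChatCompletion.create` interface, which was removed in openai-python 1.0. If the class is ever restored, a minimal sketch of the equivalent call against the v1 client (assuming OPENAI_API_KEY is set in the environment) would look like:

    from openai import OpenAI  # openai-python >= 1.0

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    def generate_response_v1(messages):
        # Same request as the deleted code, expressed against the v1 client.
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
            max_tokens=500,
            temperature=0.7,
        )
        # v1 responses are typed objects: the content is an attribute,
        # not a dict key as in response.choices[0].message["content"].
        return response.choices[0].message.content.strip()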
@@ -1421,12 +1421,12 @@ def create_chatbot_interface():
         voice_status = gr.Textbox(label="Voice Status", value="No voice saved yet")
 
         # Language selector and controls for chat
-        with gr.Row(elem_classes=["chat-controls"]):
-            language_selector = gr.Dropdown(
-                choices=["ml-IN", "en-US", "hi-IN", "ta-IN", "te-IN", "kn-IN"],
-                value="ml-IN",
-                label="Speech Recognition Language"
-            )
+        # with gr.Row(elem_classes=["chat-controls"]):
+        #     language_selector = gr.Dropdown(
+        #         choices=["ml-IN", "en-US", "hi-IN", "ta-IN", "te-IN", "kn-IN"],
+        #         value="ml-IN",
+        #         label="Speech Recognition Language"
+        #     )
         clear_btn = gr.Button("🧹 Clear Chat", scale=0)
 
         # Chat display area
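
Review note: with the dropdown commented out, any event handler that was wired to `language_selector` loses that input, and `transcribe_audio` falls back to its hard-coded `ml-IN` default. A hedged sketch of one way to keep the language configurable without a visible control, using `gr.State` (the component names `mic_input` and `transcript` are hypothetical here, since the actual event wiring sits outside this hunk):

    import gradio as gr

    with gr.Blocks() as demo:
        # Pin the recognition language in session state instead of a dropdown.
        language_state = gr.State("ml-IN")
        mic_input = gr.Audio(sources=["microphone"], type="numpy")
        transcript = gr.Textbox(label="Transcript")

        # transcribe_audio returns (display_text, raw_text); show the first.
        mic_input.stop_recording(
            fn=lambda audio, lang: conversation_engine.transcribe_audio(audio, lang)[0],
            inputs=[mic_input, language_state],
            outputs=transcript,
        )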
@@ -1438,15 +1438,15 @@ def create_chatbot_interface():
             elem_classes=["chat-window"]
         )
 
-        # Progress bar for TTS generation
-        with gr.Row():
-            tts_progress = gr.Slider(
-                minimum=0,
-                maximum=100,
-                value=0,
-                label="TTS Progress",
-                interactive=False
-            )
+        # # Progress bar for TTS generation
+        # with gr.Row():
+        #     tts_progress = gr.Slider(
+        #         minimum=0,
+        #         maximum=100,
+        #         value=0,
+        #         label="TTS Progress",
+        #         interactive=False
+        #     )
 
         # Audio output for the bot's response
         audio_output = gr.Audio(
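
Review note: the removed slider was doubling as a progress bar. Gradio ships a built-in alternative, `gr.Progress`, which renders its own bar while a handler runs; whether that motivated this change is not stated in the commit. A minimal sketch (the handler name is hypothetical):

    import gradio as gr

    def synthesize_with_progress(text, progress=gr.Progress()):
        # gr.Progress draws its own bar, so no dedicated slider is needed.
        progress(0.1, desc="Generating speech")
        audio_path = conversation_engine._generate_tts(text)
        progress(1.0, desc="Done")
        return audio_path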
@@ -1456,12 +1456,12 @@ def create_chatbot_interface():
             visible=True
         )
 
-        # Status message for debugging
-        status_msg = gr.Textbox(
-            label="Status",
-            value="Ready",
-            interactive=False
-        )
+        # # Status message for debugging
+        # status_msg = gr.Textbox(
+        #     label="Status",
+        #     value="Ready",
+        #     interactive=False
+        # )
 
         # Input area with separate components
         with gr.Row(elem_classes=["input-area"]):
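
Review note: handlers that previously returned a value into `status_msg` need their outputs lists trimmed to match. If lightweight status feedback is still wanted without reserving layout space, `gr.Info` raises a transient toast from inside a handler; a sketch (hypothetical handler name):

    import gradio as gr

    def report_status():
        # Transient toast instead of the removed status textbox.
        gr.Info("Ready")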
 