ceymox committed on
Commit
3ddece1
·
verified ·
1 Parent(s): b097cb6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +246 -246
app.py CHANGED
@@ -628,275 +628,275 @@ class StreamingTTS:
628
  return self.output_file
629
  return None
630
 
631
- # class ConversationEngine:
632
- # def __init__(self):
633
- # self.conversation_history = []
634
- # self.system_prompt = "You are a helpful assistant that speaks Malayalam fluently. Always respond in Malayalam script with proper formatting."
635
- # self.saved_voice = None
636
- # self.saved_voice_text = ""
637
- # self.tts_cache = {} # Cache for TTS outputs
638
 
639
- # # TTS background processing queue
640
- # self.tts_queue = queue.Queue()
641
- # self.tts_thread = threading.Thread(target=self.tts_worker, daemon=True)
642
- # self.tts_thread.start()
643
-
644
- # # Initialize streaming TTS
645
- # self.streaming_tts = StreamingTTS()
646
-
647
- # def tts_worker(self):
648
- # """Background worker to process TTS requests"""
649
- # while True:
650
- # try:
651
- # # Get text and callback from queue
652
- # text, callback = self.tts_queue.get()
653
 
654
- # # Generate speech
655
- # audio_path = self._generate_tts(text)
656
 
657
- # # Execute callback with result
658
- # if callback:
659
- # callback(audio_path)
660
 
661
- # # Mark task as done
662
- # self.tts_queue.task_done()
663
- # except Exception as e:
664
- # print(f"Error in TTS worker: {e}")
665
- # traceback.print_exc()
666
-
667
- # def transcribe_audio(self, audio_data, language="ml-IN"):
668
- # """Convert audio to text using speech recognition"""
669
- # if audio_data is None:
670
- # print("No audio data received")
671
- # return "No audio detected", ""
672
-
673
- # # Make sure we have audio data in the expected format
674
- # try:
675
- # if isinstance(audio_data, tuple) and len(audio_data) == 2:
676
- # # Expected format: (sample_rate, audio_samples)
677
- # sample_rate, audio_samples = audio_data
678
- # else:
679
- # print(f"Unexpected audio format: {type(audio_data)}")
680
- # return "Invalid audio format", ""
681
-
682
- # if len(audio_samples) == 0:
683
- # print("Empty audio samples")
684
- # return "No speech detected", ""
685
-
686
- # # Save the audio temporarily
687
- # temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
688
- # temp_file.close()
689
-
690
- # # Save the audio data to the temporary file
691
- # sf.write(temp_file.name, audio_samples, sample_rate)
692
-
693
- # # Use speech recognition on the file
694
- # recognizer = sr.Recognizer()
695
- # with sr.AudioFile(temp_file.name) as source:
696
- # audio = recognizer.record(source)
697
-
698
- # text = recognizer.recognize_google(audio, language=language)
699
- # print(f"Recognized: {text}")
700
- # return text, text
701
-
702
- # except sr.UnknownValueError:
703
- # print("Speech recognition could not understand audio")
704
- # return "Could not understand audio", ""
705
- # except sr.RequestError as e:
706
- # print(f"Could not request results from Google Speech Recognition service: {e}")
707
- # return f"Speech recognition service error: {str(e)}", ""
708
- # except Exception as e:
709
- # print(f"Error processing audio: {e}")
710
- # traceback.print_exc()
711
- # return f"Error processing audio: {str(e)}", ""
712
- # finally:
713
- # # Clean up temporary file
714
- # if 'temp_file' in locals() and os.path.exists(temp_file.name):
715
- # try:
716
- # os.unlink(temp_file.name)
717
- # except Exception as e:
718
- # print(f"Error deleting temporary file: {e}")
719
-
720
- # def save_reference_voice(self, audio_data, reference_text):
721
- # """Save the reference voice for future TTS generation"""
722
- # if audio_data is None or not reference_text.strip():
723
- # return "Error: Both reference audio and text are required"
724
-
725
- # self.saved_voice = audio_data
726
- # self.saved_voice_text = reference_text.strip()
727
 
728
- # # Clear TTS cache when voice changes
729
- # self.tts_cache.clear()
730
 
731
- # # Debug info
732
- # sample_rate, audio_samples = audio_data
733
- # print(f"Saved reference voice: {len(audio_samples)} samples at {sample_rate}Hz")
734
- # print(f"Reference text: {reference_text}")
735
 
736
- # return f"Voice saved successfully! Reference text: {reference_text}"
737
-
738
- # def process_text_input(self, text):
739
- # """Process text input from user"""
740
- # if text and text.strip():
741
- # return text, text
742
- # return "No input provided", ""
743
-
744
- # def generate_response(self, input_text):
745
- # """Generate AI response using GPT-3.5 Turbo"""
746
- # if not input_text or not input_text.strip():
747
- # return "ഇൻപുട്ട് ലഭിച്ചില്ല. വീണ്ടും ശ്രമിക്കുക.", None # "No input received. Please try again."
748
-
749
- # try:
750
- # # Prepare conversation context from history
751
- # messages = [{"role": "system", "content": self.system_prompt}]
752
-
753
- # # Add previous conversations for context
754
- # for entry in self.conversation_history:
755
- # role = "user" if entry["role"] == "user" else "assistant"
756
- # messages.append({"role": role, "content": entry["content"]})
757
-
758
- # # Add current input
759
- # messages.append({"role": "user", "content": input_text})
760
-
761
- # # Call OpenAI API
762
- # response = openai.ChatCompletion.create(
763
- # model="gpt-3.5-turbo",
764
- # messages=messages,
765
- # max_tokens=500,
766
- # temperature=0.7
767
- # )
768
-
769
- # response_text = response.choices[0].message["content"].strip()
770
- # return response_text, None
771
-
772
- # except Exception as e:
773
- # error_msg = f"എറർ: GPT മോഡലിൽ നിന്ന് ഉത്തരം ലഭിക്കുന്നതിൽ പ്രശ്നമുണ്ടായി: {str(e)}"
774
- # print(f"Error in GPT response: {e}")
775
- # traceback.print_exc()
776
- # return error_msg, None
777
-
778
- # def resample_audio(self, audio, orig_sr, target_sr):
779
- # """Resample audio to match target sample rate only if necessary"""
780
- # if orig_sr != target_sr:
781
- # print(f"Resampling audio from {orig_sr}Hz to {target_sr}Hz")
782
- # return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
783
- # return audio
784
-
785
- # def _generate_tts(self, text):
786
- # """Internal method to generate TTS without threading"""
787
- # if not text or not text.strip():
788
- # print("No text provided for TTS generation")
789
- # return None
790
 
791
- # # Check cache first
792
- # if text in self.tts_cache:
793
- # print("Using cached TTS output")
794
- # return self.tts_cache[text]
795
-
796
- # try:
797
- # # Check if we have a saved voice and the TTS model
798
- # if self.saved_voice is not None and tts_model is not None:
799
- # sample_rate, audio_data = self.saved_voice
800
 
801
- # # Create a temporary file for the reference audio
802
- # ref_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
803
- # ref_temp_file.close()
804
- # print(f"Saving reference audio to {ref_temp_file.name}")
805
 
806
- # # Save the reference audio data
807
- # sf.write(ref_temp_file.name, audio_data, sample_rate)
808
-
809
- # # Create a temporary file for the output audio
810
- # output_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
811
- # output_temp_file.close()
812
-
813
- # try:
814
- # # Generate speech using IndicF5 - simplified approach from second file
815
- # print(f"Generating speech with IndicF5. Text: {text[:30]}...")
816
- # start_time = time.time()
817
-
818
- # # Use torch.no_grad() to save memory and computation
819
- # with torch.no_grad():
820
- # # Run the inference using the wrapper
821
- # synth_audio = tts_model_wrapper.generate(
822
- # text,
823
- # ref_audio_path=ref_temp_file.name,
824
- # ref_text=self.saved_voice_text
825
- # )
826
 
827
- # end_time = time.time()
828
- # print(f"Speech generation completed in {end_time - start_time:.2f} seconds")
829
 
830
- # # Process audio for better quality
831
- # synth_audio = enhance_audio(synth_audio)
832
 
833
- # # Save the synthesized audio
834
- # sf.write(output_temp_file.name, synth_audio, 24000) # IndicF5 uses 24kHz
835
 
836
- # # Add to cache
837
- # self.tts_cache[text] = output_temp_file.name
838
 
839
- # print(f"TTS output saved to {output_temp_file.name}")
840
- # return output_temp_file.name
841
 
842
- # except Exception as e:
843
- # print(f"Error generating speech: {e}")
844
- # traceback.print_exc()
845
- # return None
846
- # finally:
847
- # # We don't delete the output file as it's returned to the caller
848
- # # But clean up reference file
849
- # try:
850
- # os.unlink(ref_temp_file.name)
851
- # except Exception as e:
852
- # print(f"Error cleaning up reference file: {e}")
853
- # else:
854
- # print("No saved voice reference or TTS model not loaded")
855
- # return None
856
- # except Exception as e:
857
- # print(f"Error in TTS processing: {e}")
858
- # traceback.print_exc()
859
- # return None
860
-
861
- # def queue_tts_generation(self, text, callback=None):
862
- # """Queue TTS generation in background thread"""
863
- # print(f"Queueing TTS generation for text: {text[:30]}...")
864
- # self.tts_queue.put((text, callback))
865
-
866
- # def generate_streamed_speech(self, text):
867
- # """Generate speech in a streaming manner for low latency"""
868
- # if not self.saved_voice:
869
- # print("No reference voice saved")
870
- # return None
871
 
872
- # if not text or not text.strip():
873
- # print("No text provided for streaming TTS")
874
- # return None
875
 
876
- # sample_rate, audio_data = self.saved_voice
877
 
878
- # # Start streaming generation
879
- # self.streaming_tts.generate(
880
- # text=text,
881
- # ref_audio=audio_data,
882
- # ref_sr=sample_rate,
883
- # ref_text=self.saved_voice_text
884
- # )
885
 
886
- # # Return the path that will be populated
887
- # return self.streaming_tts.output_file
888
 
889
- # def update_history(self, user_input, ai_response):
890
- # """Update conversation history"""
891
- # if user_input and user_input.strip():
892
- # self.conversation_history.append({"role": "user", "content": user_input})
893
 
894
- # if ai_response and ai_response.strip():
895
- # self.conversation_history.append({"role": "assistant", "content": ai_response})
896
 
897
- # # Limit history size
898
- # if len(self.conversation_history) > 20:
899
- # self.conversation_history = self.conversation_history[-20:]
900
 
901
  # Initialize global conversation engine
902
  conversation_engine = ConversationEngine()
 
628
  return self.output_file
629
  return None
630
 
631
class ConversationEngine:
    """Core orchestrator for the Malayalam voice assistant.

    Glues together three stages:
      1. speech-to-text via Google Speech Recognition (``transcribe_audio``),
      2. reply generation with GPT-3.5 Turbo (``generate_response``),
      3. text-to-speech driven by a user-supplied reference voice, either
         batched on a background worker thread (``queue_tts_generation`` /
         ``_generate_tts``) or streamed for low latency
         (``generate_streamed_speech`` through ``StreamingTTS``).

    TTS output paths are cached per input text in ``tts_cache``; the cache is
    cleared whenever a new reference voice is saved.
    """

    def __init__(self):
        self.conversation_history = []  # alternating {"role", "content"} dicts
        self.system_prompt = "You are a helpful assistant that speaks Malayalam fluently. Always respond in Malayalam script with proper formatting."
        self.saved_voice = None         # (sample_rate, samples) tuple or None
        self.saved_voice_text = ""      # transcript matching saved_voice
        self.tts_cache = {}             # text -> path of synthesized wav file

        # Background queue + daemon thread so TTS never blocks the caller.
        self.tts_queue = queue.Queue()
        self.tts_thread = threading.Thread(target=self.tts_worker, daemon=True)
        self.tts_thread.start()

        # Low-latency streaming TTS backend.
        self.streaming_tts = StreamingTTS()

    def tts_worker(self):
        """Background worker that drains ``tts_queue`` forever.

        Each queue item is a ``(text, callback)`` tuple; the callback (if
        any) receives the path of the generated wav file, or None on failure.
        """
        while True:
            try:
                # Blocks until a request is available.
                text, callback = self.tts_queue.get()
                try:
                    audio_path = self._generate_tts(text)
                    if callback:
                        callback(audio_path)
                finally:
                    # BUGFIX: task_done() now runs even when the callback
                    # raises; previously it was skipped on error, so any
                    # tts_queue.join() would hang forever.
                    self.tts_queue.task_done()
            except Exception as e:
                print(f"Error in TTS worker: {e}")
                traceback.print_exc()

    def transcribe_audio(self, audio_data, language="ml-IN"):
        """Convert recorded audio to text using Google speech recognition.

        Args:
            audio_data: ``(sample_rate, samples)`` tuple (Gradio audio
                widget format) or None.
            language: BCP-47 code passed to the recognizer; defaults to
                Malayalam (India).

        Returns:
            ``(display_text, recognized_text)``. On any failure the second
            element is ``""`` so downstream stages can skip processing.
        """
        if audio_data is None:
            print("No audio data received")
            return "No audio detected", ""

        try:
            if isinstance(audio_data, tuple) and len(audio_data) == 2:
                # Expected format: (sample_rate, audio_samples)
                sample_rate, audio_samples = audio_data
            else:
                print(f"Unexpected audio format: {type(audio_data)}")
                return "Invalid audio format", ""

            if len(audio_samples) == 0:
                print("Empty audio samples")
                return "No speech detected", ""

            # The recognizer reads from disk, so round-trip through a
            # temporary wav file.
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
            temp_file.close()
            sf.write(temp_file.name, audio_samples, sample_rate)

            recognizer = sr.Recognizer()
            with sr.AudioFile(temp_file.name) as source:
                audio = recognizer.record(source)

            text = recognizer.recognize_google(audio, language=language)
            print(f"Recognized: {text}")
            return text, text

        except sr.UnknownValueError:
            print("Speech recognition could not understand audio")
            return "Could not understand audio", ""
        except sr.RequestError as e:
            print(f"Could not request results from Google Speech Recognition service: {e}")
            return f"Speech recognition service error: {str(e)}", ""
        except Exception as e:
            print(f"Error processing audio: {e}")
            traceback.print_exc()
            return f"Error processing audio: {str(e)}", ""
        finally:
            # temp_file only exists if the format checks passed.
            if 'temp_file' in locals() and os.path.exists(temp_file.name):
                try:
                    os.unlink(temp_file.name)
                except Exception as e:
                    print(f"Error deleting temporary file: {e}")

    def save_reference_voice(self, audio_data, reference_text):
        """Save the reference voice used for all subsequent TTS generation.

        Args:
            audio_data: ``(sample_rate, samples)`` tuple of the reference
                recording.
            reference_text: transcript of the reference recording.

        Returns:
            A human-readable status string.
        """
        if audio_data is None or not reference_text.strip():
            return "Error: Both reference audio and text are required"

        self.saved_voice = audio_data
        self.saved_voice_text = reference_text.strip()

        # Cached wavs were synthesized with the old voice — invalidate them.
        self.tts_cache.clear()

        # Debug info
        sample_rate, audio_samples = audio_data
        print(f"Saved reference voice: {len(audio_samples)} samples at {sample_rate}Hz")
        print(f"Reference text: {reference_text}")

        return f"Voice saved successfully! Reference text: {reference_text}"

    def process_text_input(self, text):
        """Validate typed user input.

        Returns ``(display_text, text_for_model)``; the second element is
        ``""`` when the input is empty or whitespace-only.
        """
        if text and text.strip():
            return text, text
        return "No input provided", ""

    def generate_response(self, input_text):
        """Generate an AI reply using GPT-3.5 Turbo.

        The full ``conversation_history`` is replayed as context after the
        system prompt.

        Returns:
            ``(response_text, None)`` — the second slot is reserved for an
            audio path filled in by later pipeline stages.
        """
        if not input_text or not input_text.strip():
            return "ഇൻപുട്ട് ലഭിച്ചില്ല. വീണ്ടും ശ്രമിക്കുക.", None  # "No input received. Please try again."

        try:
            # Prepare conversation context from history.
            messages = [{"role": "system", "content": self.system_prompt}]
            for entry in self.conversation_history:
                role = "user" if entry["role"] == "user" else "assistant"
                messages.append({"role": role, "content": entry["content"]})

            # Add current input.
            messages.append({"role": "user", "content": input_text})

            # Legacy (pre-1.0) OpenAI SDK call style.
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=messages,
                max_tokens=500,
                temperature=0.7
            )

            response_text = response.choices[0].message["content"].strip()
            return response_text, None

        except Exception as e:
            error_msg = f"എറർ: GPT മോഡലിൽ നിന്ന് ഉത്തരം ലഭിക്കുന്നതിൽ പ്രശ്നമുണ്ടായി: {str(e)}"
            print(f"Error in GPT response: {e}")
            traceback.print_exc()
            return error_msg, None

    def resample_audio(self, audio, orig_sr, target_sr):
        """Resample audio to target_sr, returning it untouched if the rates
        already match (avoids a needless librosa pass)."""
        if orig_sr != target_sr:
            print(f"Resampling audio from {orig_sr}Hz to {target_sr}Hz")
            return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
        return audio

    def _generate_tts(self, text):
        """Synchronously synthesize ``text`` with the saved reference voice.

        Returns the path of the generated wav file, a cached path when the
        same text was synthesized before, or None on any failure or when no
        reference voice / model is available.
        """
        if not text or not text.strip():
            print("No text provided for TTS generation")
            return None

        # Check cache first — synthesis is expensive.
        if text in self.tts_cache:
            print("Using cached TTS output")
            return self.tts_cache[text]

        try:
            # Need both a saved voice and a loaded TTS model.
            if self.saved_voice is not None and tts_model is not None:
                sample_rate, audio_data = self.saved_voice

                # IndicF5 expects the reference audio as a file on disk.
                ref_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
                ref_temp_file.close()
                print(f"Saving reference audio to {ref_temp_file.name}")
                sf.write(ref_temp_file.name, audio_data, sample_rate)

                # Output wav; intentionally not deleted — its path is the
                # return value and is also kept in the cache.
                output_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
                output_temp_file.close()

                try:
                    print(f"Generating speech with IndicF5. Text: {text[:30]}...")
                    start_time = time.time()

                    # no_grad: inference only, saves memory and compute.
                    with torch.no_grad():
                        synth_audio = tts_model_wrapper.generate(
                            text,
                            ref_audio_path=ref_temp_file.name,
                            ref_text=self.saved_voice_text
                        )

                    end_time = time.time()
                    print(f"Speech generation completed in {end_time - start_time:.2f} seconds")

                    # Post-process for better quality, then persist.
                    synth_audio = enhance_audio(synth_audio)
                    sf.write(output_temp_file.name, synth_audio, 24000)  # IndicF5 uses 24kHz

                    self.tts_cache[text] = output_temp_file.name
                    print(f"TTS output saved to {output_temp_file.name}")
                    return output_temp_file.name

                except Exception as e:
                    print(f"Error generating speech: {e}")
                    traceback.print_exc()
                    return None
                finally:
                    # Keep the output file (returned to caller); only the
                    # reference wav is cleaned up here.
                    try:
                        os.unlink(ref_temp_file.name)
                    except Exception as e:
                        print(f"Error cleaning up reference file: {e}")
            else:
                print("No saved voice reference or TTS model not loaded")
                return None
        except Exception as e:
            print(f"Error in TTS processing: {e}")
            traceback.print_exc()
            return None

    def queue_tts_generation(self, text, callback=None):
        """Queue TTS generation on the background worker thread."""
        print(f"Queueing TTS generation for text: {text[:30]}...")
        self.tts_queue.put((text, callback))

    def generate_streamed_speech(self, text):
        """Start low-latency streaming synthesis of ``text``.

        Returns the output path that ``StreamingTTS`` will progressively
        populate, or None when no reference voice or text is available.
        """
        # Consistency fix: use the same `is None` check as the rest of the
        # class (saved_voice is either None or a non-empty tuple, so this
        # is behaviorally identical to the old truthiness test).
        if self.saved_voice is None:
            print("No reference voice saved")
            return None

        if not text or not text.strip():
            print("No text provided for streaming TTS")
            return None

        sample_rate, audio_data = self.saved_voice

        # Kick off streaming generation.
        self.streaming_tts.generate(
            text=text,
            ref_audio=audio_data,
            ref_sr=sample_rate,
            ref_text=self.saved_voice_text
        )

        # Return the path that will be populated as audio streams in.
        return self.streaming_tts.output_file

    def update_history(self, user_input, ai_response):
        """Append the latest exchange to the history, dropping blank turns
        and trimming to the 20 most recent entries."""
        if user_input and user_input.strip():
            self.conversation_history.append({"role": "user", "content": user_input})

        if ai_response and ai_response.strip():
            self.conversation_history.append({"role": "assistant", "content": ai_response})

        # Limit history size to bound the GPT context window.
        if len(self.conversation_history) > 20:
            self.conversation_history = self.conversation_history[-20:]
901
# Module-level singleton engine shared by the UI callbacks.
conversation_engine = ConversationEngine()