hashhac committed on
Commit
519f37a
·
1 Parent(s): 23c481b
Files changed (1) hide show
  1. app.py +32 -31
app.py CHANGED
@@ -62,16 +62,20 @@ def load_llm_model():
62
 
63
  return model, tokenizer
64
 
 
65
  # Step 3: Text-to-Speech with a free model
66
  def load_tts_model():
 
 
 
67
  model_id = "microsoft/speecht5_tts"
68
- processor = AutoProcessor.from_pretrained(model_id)
69
- model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
70
  model.to(device)
71
 
72
  # Load vocoder for waveform generation
73
  vocoder_id = "microsoft/speecht5_hifigan"
74
- vocoder = AutoModelForCausalLM.from_pretrained(vocoder_id)
75
  vocoder.to(device)
76
 
77
  # Load speaker embeddings
@@ -80,6 +84,30 @@ def load_tts_model():
80
 
81
  return model, processor, vocoder, speaker_embeddings
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  # Initialize all models
84
  print("Loading ASR model...")
85
  asr_pipeline = load_asr_model()
@@ -137,34 +165,7 @@ def generate_response(prompt):
137
 
138
  return response_text
139
 
140
- def text_to_speech(text):
141
- # Prepare inputs
142
- inputs = tts_processor(text=text, return_tensors="pt")
143
-
144
- # Add speaker embeddings
145
- inputs["speaker_embeddings"] = speaker_embeddings.to(device)
146
-
147
- # Generate speech
148
- with torch.no_grad():
149
- speech = tts_model.generate_speech(
150
- inputs["input_ids"].to(device),
151
- speaker_embeddings.to(device)
152
- )
153
-
154
- # Convert to waveform using vocoder
155
- with torch.no_grad():
156
- waveform = tts_vocoder(speech)
157
-
158
- # Convert to numpy array
159
- audio_array = waveform.cpu().numpy().squeeze()
160
-
161
- # Normalize and convert to int16
162
- audio_array = (audio_array / np.max(np.abs(audio_array)) * 32767).astype(np.int16)
163
-
164
- # Reshape for fastrtc
165
- audio_array = audio_array.reshape(1, -1)
166
-
167
- return (24000, audio_array) # Using 24kHz sample rate
168
 
169
  def response(audio: tuple[int, np.ndarray]):
170
  # Step 1: Speech-to-Text
 
62
 
63
  return model, tokenizer
64
 
65
+ # Step 3: Text-to-Speech with a free model
66
  # Step 3: Text-to-Speech with a free model
67
  def load_tts_model():
68
+ # Import the specific SpeechT5 classes
69
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
70
+
71
  model_id = "microsoft/speecht5_tts"
72
+ processor = SpeechT5Processor.from_pretrained(model_id)
73
+ model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
74
  model.to(device)
75
 
76
  # Load vocoder for waveform generation
77
  vocoder_id = "microsoft/speecht5_hifigan"
78
+ vocoder = SpeechT5HifiGan.from_pretrained(vocoder_id)
79
  vocoder.to(device)
80
 
81
  # Load speaker embeddings
 
84
 
85
  return model, processor, vocoder, speaker_embeddings
86
 
87
+ def text_to_speech(text):
88
+ # Prepare inputs
89
+ inputs = tts_processor(text=text, return_tensors="pt")
90
+
91
+ # Generate speech with SpeechT5
92
+ with torch.no_grad():
93
+ # Generate speech
94
+ speech = tts_model.generate_speech(
95
+ inputs["input_ids"].to(device),
96
+ speaker_embeddings.to(device),
97
+ vocoder=tts_vocoder
98
+ )
99
+
100
+ # Convert to numpy array
101
+ audio_array = speech.cpu().numpy()
102
+
103
+ # Normalize and convert to int16
104
+ audio_array = (audio_array / np.max(np.abs(audio_array)) * 32767).astype(np.int16)
105
+
106
+ # Reshape for fastrtc
107
+ audio_array = audio_array.reshape(1, -1)
108
+
109
+ return (16000, audio_array) # SpeechT5 uses 16kHz sample rate
110
+
111
  # Initialize all models
112
  print("Loading ASR model...")
113
  asr_pipeline = load_asr_model()
 
165
 
166
  return response_text
167
 
168
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  def response(audio: tuple[int, np.ndarray]):
171
  # Step 1: Speech-to-Text