bluenevus committed on
Commit
180ce7d
·
verified ·
1 Parent(s): 92bd40b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -13
app.py CHANGED
@@ -141,9 +141,7 @@ def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty
141
  return None
142
 
143
  try:
144
- # Load the intro/outro music
145
  music = AudioSegment.from_mp3("Maiko-intro-outro.mp3")
146
- # Resample music to 24000 Hz to match speech
147
  music = music.set_frame_rate(24000)
148
 
149
  progress(0.1, "Processing text...")
@@ -154,10 +152,7 @@ def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty
154
  if not line.strip():
155
  continue
156
 
157
- if num_hosts == "2":
158
- voice = voice1 if i % 2 == 0 else voice2
159
- else:
160
- voice = voice1
161
 
162
  input_ids, attention_mask = process_prompt(line, voice, tokenizer, device)
163
 
@@ -182,10 +177,8 @@ def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty
182
  line_audio = redistribute_codes(code_list, snac_model)
183
  audio_samples.append(line_audio)
184
 
185
- # Concatenate all audio samples
186
  final_audio = np.concatenate(audio_samples)
187
 
188
- # Convert numpy array to AudioSegment
189
  speech_audio = AudioSegment(
190
  final_audio.tobytes(),
191
  frame_rate=24000,
@@ -193,17 +186,13 @@ def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty
193
  channels=1
194
  )
195
 
196
- # Combine intro, speech, and outro
197
  combined_audio = music + speech_audio + music
198
 
199
- # Convert back to numpy array
200
  combined_numpy = np.array(combined_audio.get_array_of_samples(), dtype=np.float32)
201
 
202
- # Normalize the audio
203
  combined_numpy = np.int16(combined_numpy / np.max(np.abs(combined_numpy)) * 32767)
204
 
205
- # Add a check for 15-second limitation
206
- max_samples = 24000 * 15 # 15 seconds at 24kHz sample rate
207
  if len(combined_numpy) > max_samples:
208
  combined_numpy = combined_numpy[:max_samples]
209
 
 
141
  return None
142
 
143
  try:
 
144
  music = AudioSegment.from_mp3("Maiko-intro-outro.mp3")
 
145
  music = music.set_frame_rate(24000)
146
 
147
  progress(0.1, "Processing text...")
 
152
  if not line.strip():
153
  continue
154
 
155
+ voice = voice1 if num_hosts == "1" or i % 2 == 0 else voice2
 
 
 
156
 
157
  input_ids, attention_mask = process_prompt(line, voice, tokenizer, device)
158
 
 
177
  line_audio = redistribute_codes(code_list, snac_model)
178
  audio_samples.append(line_audio)
179
 
 
180
  final_audio = np.concatenate(audio_samples)
181
 
 
182
  speech_audio = AudioSegment(
183
  final_audio.tobytes(),
184
  frame_rate=24000,
 
186
  channels=1
187
  )
188
 
 
189
  combined_audio = music + speech_audio + music
190
 
 
191
  combined_numpy = np.array(combined_audio.get_array_of_samples(), dtype=np.float32)
192
 
 
193
  combined_numpy = np.int16(combined_numpy / np.max(np.abs(combined_numpy)) * 32767)
194
 
195
+ max_samples = 24000 * 15
 
196
  if len(combined_numpy) > max_samples:
197
  combined_numpy = combined_numpy[:max_samples]
198