Update app.py
app.py CHANGED
@@ -141,9 +141,7 @@ def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty
         return None
 
     try:
-        # Load the intro/outro music
         music = AudioSegment.from_mp3("Maiko-intro-outro.mp3")
-        # Resample music to 24000 Hz to match speech
         music = music.set_frame_rate(24000)
 
         progress(0.1, "Processing text...")
@@ -154,10 +152,7 @@ def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty
             if not line.strip():
                 continue
 
-            if num_hosts == "2":
-                voice = voice1 if i % 2 == 0 else voice2
-            else:
-                voice = voice1
+            voice = voice1 if num_hosts == "1" or i % 2 == 0 else voice2
 
             input_ids, attention_mask = process_prompt(line, voice, tokenizer, device)
 
@@ -182,10 +177,8 @@ def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty
             line_audio = redistribute_codes(code_list, snac_model)
             audio_samples.append(line_audio)
 
-        # Concatenate all audio samples
         final_audio = np.concatenate(audio_samples)
 
-        # Convert numpy array to AudioSegment
         speech_audio = AudioSegment(
             final_audio.tobytes(),
             frame_rate=24000,
@@ -193,17 +186,13 @@ def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty
             channels=1
         )
 
-        # Combine intro, speech, and outro
         combined_audio = music + speech_audio + music
 
-        # Convert back to numpy array
         combined_numpy = np.array(combined_audio.get_array_of_samples(), dtype=np.float32)
 
-        # Normalize the audio
         combined_numpy = np.int16(combined_numpy / np.max(np.abs(combined_numpy)) * 32767)
 
-
-        max_samples = 24000 * 15  # 15 seconds at 24kHz sample rate
+        max_samples = 24000 * 15
         if len(combined_numpy) > max_samples:
             combined_numpy = combined_numpy[:max_samples]
 
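
The functional change here is the per-line voice selection: the four-line branch is collapsed into a single conditional expression. The two forms agree whenever num_hosts is the string "1" or "2" (the only cases the old branch distinguished); for any other value the old code always fell back to voice1, while the one-liner alternates between voice1 and voice2. A minimal sketch of that equivalence, with placeholder voice names:

def pick_voice_old(num_hosts, i, voice1, voice2):
    # branch removed by this commit
    if num_hosts == "2":
        return voice1 if i % 2 == 0 else voice2
    else:
        return voice1

def pick_voice_new(num_hosts, i, voice1, voice2):
    # one-liner added by this commit
    return voice1 if num_hosts == "1" or i % 2 == 0 else voice2

# the selectors agree for the "1" / "2" host settings
for hosts in ("1", "2"):
    for i in range(6):
        assert pick_voice_old(hosts, i, "A", "B") == pick_voice_new(hosts, i, "A", "B")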
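
The remaining hunks only strip comments, but the post-processing they touch is worth seeing end to end: wrap the generated samples in a pydub AudioSegment, sandwich them between the intro/outro music, peak-normalize back to int16, and hard-cap the result at 15 seconds. Below is a self-contained sketch under stated assumptions: mono int16 audio throughout (hence sample_width=2), plus a made-up helper name and test tone; the music file name and the constants come from the diff.

import numpy as np
from pydub import AudioSegment

def assemble_podcast(speech_int16, music_path="Maiko-intro-outro.mp3"):
    # intro/outro music, resampled to 24 kHz to match the speech
    music = AudioSegment.from_mp3(music_path).set_frame_rate(24000)

    # wrap raw int16 samples (sample_width=2 is an assumption consistent with the int16 cast below)
    speech_audio = AudioSegment(
        speech_int16.tobytes(),
        frame_rate=24000,
        sample_width=2,
        channels=1,
    )

    # intro + speech + outro, then peak-normalize back to int16
    combined = music + speech_audio + music
    samples = np.array(combined.get_array_of_samples(), dtype=np.float32)
    samples = np.int16(samples / np.max(np.abs(samples)) * 32767)

    # hard cap at 15 seconds (24000 samples per second * 15), as in the diff
    max_samples = 24000 * 15
    return samples[:max_samples]

# example: a one-second 220 Hz tone standing in for the generated speech
t = np.arange(24000) / 24000.0
tone = np.int16(0.3 * 32767 * np.sin(2 * np.pi * 220 * t))
print(assemble_podcast(tone).shape)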