Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -85,32 +85,35 @@ def generateAudio(text_to_audio, s3_save_as):
|
|
85 |
def save_text_to_speech(text, speaker=None):
    """Synthesize `text` to speech and upload the resulting WAV to S3.

    text: input text; truncated to at most 500 tokens by `cut_text`.
    speaker: optional index into `embeddings_dataset` selecting a voice
        x-vector; when None a random embedding (random voice) is used.
    """
    # Preprocess text and truncate ("recortar") to the model's input limit.
    text = cut_text(text, max_tokens=500)
    # preprocess text
    inputs = processor(text=text, return_tensors="pt").to(device)
    if speaker is not None:
        # load xvector containing speaker's voice characteristics from a dataset
        speaker_embeddings = torch.tensor(
            embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
    else:
        # random vector, meaning a random voice
        speaker_embeddings = torch.randn((1, 512)).to(device)
    # generate speech with the models
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Write the waveform into an in-memory WAV buffer (16 kHz, per the
    # samplerate passed to sf.write below) and rewind it for the uploader.
    audio_buffer = BytesIO()
    sf.write(audio_buffer, speech.cpu().numpy(), samplerate=16000, format='WAV')
    audio_buffer.seek(0)

    # Upload the in-memory WAV to S3.
    save_audio_to_s3(audio_buffer)
|
110 |
|
111 |
save_text_to_speech(text_to_audio, 2271)
|
112 |
return s3_save_as
|
113 |
|
114 |
|
115 |
-
# Gradio UI: takes the text to synthesize and a target S3 URL, returns the
# S3 location produced by generateAudio.
iface = gr.Interface(fn=generateAudio, inputs=[Textbox(label="text_to_audio"), Textbox(label="S3url")], outputs="text")
iface.launch()
|
|
|
|
85 |
def save_text_to_speech(text, speaker=None):
    """Synthesize `text` to speech in segments and upload the WAV to S3.

    The text is truncated to at most 500 tokens, split into 100-character
    segments, synthesized segment by segment, concatenated into one
    waveform, written to an in-memory WAV buffer, and handed to
    `save_audio_to_s3`.

    text: input text to synthesize.
    speaker: optional index into `embeddings_dataset` selecting a voice
        x-vector; when None a random embedding (random voice) is used.
    """
    # Preprocess text and truncate to the model's supported length.
    text = cut_text(text, max_tokens=500)

    # Split the text into 100-character segments.
    # NOTE(review): this can cut words in half; splitting on whitespace
    # near the boundary would likely produce smoother audio.
    segmentos = [text[i:i + 100] for i in range(0, len(text), 100)]

    # Generate audio for each segment, then combine them.
    audio_segments = []
    for segment in segmentos:
        inputs = processor(text=segment, return_tensors="pt").to(device)
        if speaker is not None:
            # Speaker x-vector (voice characteristics) from the dataset.
            speaker_embeddings = torch.tensor(
                embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
        else:
            # Random vector, meaning a random voice.
            speaker_embeddings = torch.randn((1, 512)).to(device)
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        audio_segments.append(speech)

    # BUG FIX: SpeechT5's generate_speech returns a 1-D waveform tensor
    # (num_frames,), so torch.cat(..., dim=1) raised IndexError; concatenate
    # along the last (time) axis instead. Guard against empty input text,
    # where torch.cat on an empty list would also raise.
    if audio_segments:
        combined_audio = torch.cat(audio_segments, dim=-1)
    else:
        combined_audio = torch.zeros(0)

    # Write the combined waveform into an in-memory WAV buffer (16 kHz)
    # and rewind it for the uploader.
    audio_buffer = BytesIO()
    sf.write(audio_buffer, combined_audio.cpu().numpy(),
             samplerate=16000, format='WAV')
    audio_buffer.seek(0)

    # Upload the combined audio to S3.
    save_audio_to_s3(audio_buffer)
|
112 |
|
113 |
save_text_to_speech(text_to_audio, 2271)
|
114 |
return s3_save_as
|
115 |
|
116 |
|
117 |
+
# Gradio UI: takes the text to synthesize and a target S3 URL, returns the
# S3 location produced by generateAudio.
iface = gr.Interface(fn=generateAudio, inputs=[Textbox(label="text_to_audio"), Textbox(label="S3url")], outputs="text", title="Text-to-Audio")
iface.launch()
|
119 |
+
|