SAUL19 committed on
Commit
e50afa4
·
1 Parent(s): 98757a9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -18
app.py CHANGED
@@ -85,32 +85,35 @@ def generateAudio(text_to_audio, s3_save_as):
85
  def save_text_to_speech(text, speaker=None):
86
  # Preprocess text and recortar
87
  text = cut_text(text, max_tokens=500)
88
- # preprocess text
89
- inputs = processor(text=text, return_tensors="pt").to(device)
90
- if speaker is not None:
91
- # load xvector containing speaker's voice characteristics from a dataset
92
- speaker_embeddings = torch.tensor(
93
- embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
94
- else:
95
- # random vector, meaning a random voice
96
- speaker_embeddings = torch.randn((1, 512)).to(device)
97
- # generate speech with the models
98
- speech = model.generate_speech(
99
- inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
100
-
101
 
102
- # create BytesIO object to store the audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  audio_buffer = BytesIO()
104
- # save the generated speech to the BytesIO buffer
105
- sf.write(audio_buffer, speech.cpu().numpy(), samplerate=16000, format='WAV')
106
  audio_buffer.seek(0)
107
 
108
- # Save the audio to S3
109
  save_audio_to_s3(audio_buffer)
110
 
111
  save_text_to_speech(text_to_audio, 2271)
112
  return s3_save_as
113
 
114
 
115
- iface = gr.Interface(fn=generateAudio, inputs=[Textbox(label="text_to_audio"), Textbox(label="S3url")], outputs="text")
116
  iface.launch()
 
 
85
  def save_text_to_speech(text, speaker=None):
86
  # Preprocess text and recortar
87
  text = cut_text(text, max_tokens=500)
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
+ # Divide el texto en segmentos de 100 caracteres
90
+ segmentos = [text[i:i+100] for i in range(0, len(text), 100)]
91
+
92
+ # Generar audio para cada segmento y combinarlos
93
+ audio_segments = []
94
+ for segment in segmentos:
95
+ inputs = processor(text=segment, return_tensors="pt").to(device)
96
+ if speaker is not None:
97
+ speaker_embeddings = torch.tensor(embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
98
+ else:
99
+ speaker_embeddings = torch.randn((1, 512)).to(device)
100
+ speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
101
+ audio_segments.append(speech)
102
+
103
+ combined_audio = torch.cat(audio_segments, dim=1)
104
+
105
+ # Crear objeto BytesIO para almacenar el audio
106
  audio_buffer = BytesIO()
107
+ sf.write(audio_buffer, combined_audio.cpu().numpy(), samplerate=16000, format='WAV')
 
108
  audio_buffer.seek(0)
109
 
110
+ # Guardar el audio combinado en S3
111
  save_audio_to_s3(audio_buffer)
112
 
113
  save_text_to_speech(text_to_audio, 2271)
114
  return s3_save_as
115
 
116
 
117
# Gradio UI: two text inputs (the text to synthesize and the S3 destination
# key); generateAudio returns the S3 key as plain text.
iface = gr.Interface(fn=generateAudio, inputs=[Textbox(label="text_to_audio"), Textbox(label="S3url")], outputs="text", title="Text-to-Audio")
iface.launch()