seayala committed on
Commit
447df5c
verified
1 Parent(s): 0e42d56

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -10
app.py CHANGED
@@ -2,6 +2,7 @@ import torch
2
  import gradio as gr
3
  import soundfile as sf
4
  import tempfile
 
5
 
6
  from transformers import (
7
  SpeechT5Processor,
@@ -9,35 +10,30 @@ from transformers import (
9
  SpeechT5HifiGan
10
  )
11
  from datasets import load_dataset
12
- import librosa
13
 
14
- # Carga de modelos y recursos
15
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
16
  model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
17
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
18
 
19
- # Speaker embeddings fijos (puedes extender a selección en el UI)
20
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
21
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
22
 
23
  # Función principal
24
  def voice_conversion(audio_file):
25
- # Cargar audio con librosa
26
- audio, sr = librosa.load(audio_file, sr=16000) # asegúrate de que esté en 16kHz
27
  inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt")
28
-
29
  with torch.no_grad():
30
  speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
31
-
32
- # Guardar en archivo temporal
33
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
34
  sf.write(f.name, speech.numpy(), samplerate=16000)
35
  return f.name
36
 
37
- # Interfaz Gradio
38
  interface = gr.Interface(
39
  fn=voice_conversion,
40
- inputs=gr.Audio(source="upload", type="filepath", label="Sube un audio (voz hablada)"),
41
  outputs=gr.Audio(type="filepath", label="Voz convertida"),
42
  title="SpeechT5 Voice Conversion",
43
  description="Convierte una voz hablada en otra con SpeechT5 de Microsoft"
 
2
  import gradio as gr
3
  import soundfile as sf
4
  import tempfile
5
+ import librosa
6
 
7
  from transformers import (
8
  SpeechT5Processor,
 
10
  SpeechT5HifiGan
11
  )
12
  from datasets import load_dataset
 
13
 
14
+ # Modelos
15
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
16
  model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
17
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
18
 
19
+ # Embeddings de voz
20
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
21
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
22
 
23
  # Función principal
24
  def voice_conversion(audio_file):
25
+ audio, sr = librosa.load(audio_file, sr=16000)
 
26
  inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt")
 
27
  with torch.no_grad():
28
  speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
 
 
29
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
30
  sf.write(f.name, speech.numpy(), samplerate=16000)
31
  return f.name
32
 
33
+ # Interfaz
34
  interface = gr.Interface(
35
  fn=voice_conversion,
36
+ inputs=gr.Audio(type="filepath", label="Sube un audio (voz hablada)"),
37
  outputs=gr.Audio(type="filepath", label="Voz convertida"),
38
  title="SpeechT5 Voice Conversion",
39
  description="Convierte una voz hablada en otra con SpeechT5 de Microsoft"