suprimedev committed · verified
Commit 89d2d90 · 1 Parent(s): 94c2b30

Update app.py

Files changed (1)
  1. app.py +95 -312
app.py CHANGED
@@ -1,326 +1,109 @@
  import gradio as gr
- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
- from datasets import load_dataset
- import torch
  import torchaudio
- import soundfile as sf
  import os
-
- # Load models
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
- model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-
- # Load a placeholder speaker embedding from a public x-vector dataset.
- # For real voice cloning you would extract the embedding from the target audio itself;
- # SpeechT5 has no built-in extractor for arbitrary audio, so that step needs a separate
- # speaker-embedding model (e.g., ECAPA-TDNN).
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-
-
- def voice_clone(text_audio_path, voice_audio_path):
-     """
-     Speak the content of text_audio_path in the voice of voice_audio_path:
-     the first file supplies the text (via ASR), the second supplies the target
-     speaker's voice characteristics.
-     """
-     try:
-         # Plan (per the request "تقلید صدای فایل اول را از فایل دوم": speak the *content* of
-         # the first file in the *voice* of the second file):
-         #   1. Transcribe text_audio_path with an ASR model (Whisper) to get the text to speak.
-         #   2. Extract the target speaker's embedding from voice_audio_path. SpeechT5 only accepts
-         #      a pre-computed x-vector and cannot derive one from arbitrary audio, so a separate
-         #      speaker-embedding model is required (e.g., ECAPA-TDNN via SpeechBrain, or
-         #      pyannote.audio's embedding inference).
-         #   3. Synthesize the transcribed text with SpeechT5, conditioned on that embedding.
-         # Both extra steps add dependencies (Whisper via transformers' ASR pipeline; a separate
-         # package for the speaker encoder).
-
-         from transformers import pipeline
-
-         # Initialize ASR pipeline
-         asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2", device=0 if torch.cuda.is_available() else -1)
-
-         # Transcribe the content from the first audio file
-         print(f"Transcribing {text_audio_path}...")
-         transcription_result = asr_pipeline(text_audio_path)
-         target_text = transcription_result["text"]
-         print(f"Transcribed text: {target_text}")
-
-         if not target_text:
-             return None, "No discernible text extracted from the first audio file. Please try a clearer audio."
-
-         # Extract a speaker embedding from the second audio file.
-         # SpeechT5 consumes x-vectors but provides no way to compute them from arbitrary audio,
-         # so a dedicated speaker-embedding model is used here. pyannote.audio's
-         # Inference("pyannote/embedding") would also work; SpeechBrain's ECAPA-TDNN encoder
-         # (pip install speechbrain) is used below because it integrates easily alongside transformers.
-
-         from speechbrain.inference.SpeakerEmbedding import SpeakerEmbedding as SpeechBrainSpeakerEmbedding
-
-         # Initialize Speaker Embedding Model
-         speaker_embedding_model_sb = SpeechBrainSpeakerEmbedding.from_hparams(
-             source="speechbrain/spkrec-ecapa-tdnn",
-             savedir="pretrained_models/spkrec-ecapa-tdnn",
-             run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"}
-         )
-
-         print(f"Extracting speaker embedding from {voice_audio_path}...")
-         # Load the second audio file for speaker embedding.
-         # SpeechBrain's `SpeakerEmbedding` expects a waveform tensor; load and resample if necessary.
-         voice_waveform, sr = torchaudio.load(voice_audio_path)
-
-         # Need to ensure correct sampling rate and mono channel for `speechbrain`
-         if voice_waveform.shape[0] > 1:  # Convert to mono
-             voice_waveform = voice_waveform.mean(dim=0, keepdim=True)
-
-         # SpeechBrain's model expects a specific sampling rate (usually 16kHz)
-         if sr != 16000:
-             resampler = torchaudio.transforms.Resample(sr, 16000)
-             voice_waveform = resampler(voice_waveform)
-
-         # Extract the speaker embedding
-         speaker_embeddings_from_voice_audio = speaker_embedding_model_sb.encode_batch(voice_waveform).squeeze(0)
-         # SpeechT5 expects embeddings with shape (1, 512) for a single speaker
-         speaker_embeddings = speaker_embeddings_from_voice_audio.unsqueeze(0)
-         print("Speaker embedding extracted.")
-
-         # Synthesize speech using SpeechT5
-         inputs = processor(text=target_text, return_tensors="pt")
-
-         # Move inputs and speaker_embeddings to the same device as the model
-         if torch.cuda.is_available():
-             inputs = {k: v.to("cuda") for k, v in inputs.items()}
-             model.to("cuda")
-             vocoder.to("cuda")
-             speaker_embeddings = speaker_embeddings.to("cuda")
-
-         print("Generating speech...")
-
-         # Generate speech
-         speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
-
-         # Normalize to be within [-1, 1] for audio playback
-         speech = speech.cpu().numpy()
-         max_val = max(abs(speech.min()), abs(speech.max()))
-         if max_val > 1.0:
-             speech = speech / max_val
-
-         print("Speech generated. Saving to temporary file...")
-
-         # Save the generated audio to a temporary file
-         output_path = "cloned_voice_output.wav"
-         sf.write(output_path, speech, vocoder.config.sampling_rate)
-
-         return output_path, "Voice cloning successful!"
-
-     except Exception as e:
-         error_message = f"An error occurred: {e}"
-         print(error_message)
-         import traceback
-         traceback.print_exc()
-         return None, error_message
-
- # Gradio Interface
- iface = gr.Interface(
-     fn=voice_clone,
-     inputs=[
-         gr.Audio(type="filepath", label="فایل صوتی اول (برای استخراج متن)", source="upload"),
-         gr.Audio(type="filepath", label="فایل صوتی دوم (برای تقلید صدا)", source="upload")
-     ],
-     outputs=[
-         gr.Audio(label="صدای شبیه سازی شده"),
-         gr.Textbox(label="وضعیت")
-     ],
-     title="Voice Cloner (تقلید صدا) با HuggingFace",
-     description="فایل صوتی اول را آپلود کنید تا متن آن استخراج شود. فایل صوتی دوم را آپلود کنید تا صدای آن تقلید شود و متن فایل اول با صدای فایل دوم تولید شود. (پشتیبانی از mp3/wav)",
-     examples=[
-         [
-             "audio_examples/example_content.wav",  # Example for content (what to say)
-             "audio_examples/example_voice.wav"     # Example for voice (how to say it)
-         ]
-     ]
- )
-
- if __name__ == "__main__":
-     # Create an example directory and dummy files if they don't exist
-     os.makedirs("audio_examples", exist_ok=True)
-     if not os.path.exists("audio_examples/example_content.wav"):
-         # Create a dummy WAV file for content
-         import numpy as np
-         samplerate = 16000
-         duration = 2.0  # seconds
-         frequency = 440  # Hz
-         t = np.linspace(0., duration, int(samplerate * duration), endpoint=False)
-         sine_wave = 0.5 * np.sin(2 * np.pi * frequency * t)
-         sf.write("audio_examples/example_content.wav", sine_wave.astype(np.float32), samplerate)
-         print("Created dummy audio_examples/example_content.wav")
-
-     if not os.path.exists("audio_examples/example_voice.wav"):
-         # Create another dummy WAV file for voice
-         samplerate = 16000
-         duration = 1.5  # seconds
-         frequency = 880  # Hz
-         t = np.linspace(0., duration, int(samplerate * duration), endpoint=False)
-         sine_wave = 0.5 * np.sin(2 * np.pi * frequency * t)
-         sf.write("audio_examples/example_voice.wav", sine_wave.astype(np.float32), samplerate)
-         print("Created dummy audio_examples/example_voice.wav")
-
-     iface.launch()
 
 
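The comments in the removed implementation describe, but never wire together, a three-model pipeline: Whisper transcribes the first file, a speaker-embedding model encodes the second, and SpeechT5 synthesizes the result. Below is a minimal sketch of that pipeline, for reference only. It reuses the model names from the removed code, except that the speaker encoder is assumed to be speechbrain/spkrec-xvect-voxceleb (whose 512-dim x-vectors match the speaker_embeddings size SpeechT5 expects); it is an illustration, not the code this commit adds.

# Sketch only: transcribe file 1 with Whisper, then speak that text in the voice of file 2 with SpeechT5.
import torch
import torchaudio
import soundfile as sf
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier

asr = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2")
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# Assumed speaker encoder: produces 512-dim x-vectors, the size SpeechT5 was trained with
spk_encoder = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="pretrained_models/spkrec-xvect-voxceleb",
)

def clone_voice(content_wav, voice_wav, out_wav="cloned.wav"):
    text = asr(content_wav)["text"]                       # file 1: what to say
    wav, sr = torchaudio.load(voice_wav)                  # file 2: how to say it
    wav = wav.mean(dim=0, keepdim=True)                   # downmix to mono
    if sr != 16000:
        wav = torchaudio.transforms.Resample(sr, 16000)(wav)
    emb = spk_encoder.encode_batch(wav)                   # (1, 1, 512)
    emb = torch.nn.functional.normalize(emb, dim=-1).squeeze(1)  # (1, 512)
    inputs = processor(text=text, return_tensors="pt")
    speech = tts_model.generate_speech(inputs["input_ids"], emb, vocoder=vocoder)
    sf.write(out_wav, speech.numpy(), 16000)              # SpeechT5/HiFi-GAN output is 16 kHz
    return out_wav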
  import gradio as gr
  import torchaudio
+ from speechbrain.pretrained import EncoderClassifier
+ from speechbrain.pretrained import HIFIGAN
+ import torch
+ import tempfile
+ import shutil
  import os
+ from pathlib import Path
+
+ # Initialize models
+ classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
+ hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech")
+
+ def extract_speaker_embedding(audio_file):
+     """Extract speaker embedding from audio file"""
+     signal, fs = torchaudio.load(audio_file)
+
+     # Resample if needed
+     if fs != 16000:
+         resampler = torchaudio.transforms.Resample(fs, 16000)
+         signal = resampler(signal)
+         fs = 16000
+
+     # Handle stereo audio
+     if signal.shape[0] > 1:
+         signal = torch.mean(signal, dim=0, keepdim=True)
+
+     embeddings = classifier.encode_batch(signal)
+     return embeddings.squeeze(0)
+
+ def voice_conversion(source_audio, target_audio):
+     """Convert the source voice to sound like the target voice"""
+     # Create temp files
+     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as source_tmp, \
+          tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as target_tmp:
+
+         source_path = source_tmp.name
+         target_path = target_tmp.name
+
+         # Copy the uploaded files (Gradio passes file paths for type="filepath")
+         shutil.copy(source_audio, source_path)
+         shutil.copy(target_audio, target_path)
+
+         try:
+             # Load the source audio; the target file is only used for its speaker embedding
+             source_signal, source_fs = torchaudio.load(source_path)
+
+             # Handle stereo audio
+             if source_signal.shape[0] > 1:
+                 source_signal = torch.mean(source_signal, dim=0, keepdim=True)
+
+             # Resample source to 16kHz if needed
+             if source_fs != 16000:
+                 resampler = torchaudio.transforms.Resample(source_fs, 16000)
+                 source_signal = resampler(source_signal)
+                 source_fs = 16000
+
+             # Extract target speaker embedding
+             target_emb = extract_speaker_embedding(target_path)
+
+             # Generate converted waveform
+             waveform = hifi_gan.generate(source_signal, speaker_emb=target_emb)
+
+             # Save output
+             output_path = os.path.join(tempfile.mkdtemp(), "output.wav")
+             torchaudio.save(output_path, waveform.squeeze(0).cpu(), 16000)
+
+             return output_path
+         finally:
+             # Clean up temp files
+             os.unlink(source_path)
+             os.unlink(target_path)
+
+ # Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# 🎙️ Voice Changer")
+     gr.Markdown("بارگذاری فایل صوتی اصلی و فایل صوتی هدف برای تبدیل صدای اول به سبک دوم")
+
+     with gr.Row():
+         with gr.Column():
+             source_audio = gr.Audio(label="فایل صوتی اصلی (صدا برای تبدیل)", type="filepath")
+         with gr.Column():
+             target_audio = gr.Audio(label="فایل صوتی هدف (سبک مورد نظر)", type="filepath")
+
+     output_audio = gr.Audio(label="خروجی تبدیل شده", interactive=False)
+
+     convert_btn = gr.Button("تبدیل صوت")
+     convert_btn.click(
+         fn=voice_conversion,
+         inputs=[source_audio, target_audio],
+         outputs=output_audio
+     )
+
+     gr.Examples(
+         examples=[
+             [os.path.join(os.path.dirname(__file__), "examples/source1.wav"),
+              os.path.join(os.path.dirname(__file__), "examples/target1.wav")],
+             [os.path.join(os.path.dirname(__file__), "examples/source2.wav"),
+              os.path.join(os.path.dirname(__file__), "examples/target2.wav")]
+         ],
+         inputs=[source_audio, target_audio],
+         outputs=output_audio,
+         fn=voice_conversion,
+         cache_examples=True
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
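For reference, a minimal sketch of the documented usage of the two SpeechBrain checkpoints the new app loads: the ECAPA classifier turns a mono 16 kHz waveform into a 192-dim speaker embedding, and tts-hifigan-ljspeech is a vocoder that decodes an 80-bin mel spectrogram into 22.05 kHz audio via decode_batch. The vocoder itself takes no speaker embedding, so the speaker_emb conditioning in voice_conversion would need a separate conversion model; the Tacotron2 checkpoint below is only an assumed stand-in to produce a mel spectrogram for the example.

# Sketch only: documented SpeechBrain calls for the checkpoints referenced in app.py.
import torch
from speechbrain.pretrained import EncoderClassifier, HIFIGAN, Tacotron2

classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb",
                                            savedir="pretrained_models/spkrec-ecapa-voxceleb")
hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech",
                                savedir="pretrained_models/tts-hifigan-ljspeech")
tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech",
                                   savedir="pretrained_models/tts-tacotron2-ljspeech")

# Speaker embedding from a mono 16 kHz waveform (here: 1 second of noise as a stand-in)
wav = torch.randn(1, 16000)
embedding = classifier.encode_batch(wav)       # shape: (1, 1, 192)

# Vocoder: mel spectrogram in, waveform out (22050 Hz for the LJSpeech checkpoint)
mel_output, mel_length, alignment = tacotron2.encode_text("A short test sentence.")
waveform = hifi_gan.decode_batch(mel_output)   # shape: (1, 1, num_samples)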