import gradio as gr
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import torchaudio
import soundfile as sf
import os
import numpy as np  # used below for generating the dummy example audio files

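# Assumed dependencies (a requirements.txt sketch for this demo; names only, versions unpinned):
#   gradio, transformers, datasets, torch, torchaudio, soundfile, numpy, speechbrain, sentencepiece
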
# Load models
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Fallback speaker embedding.
# SpeechT5 conditions its output on a 512-dim x-vector speaker embedding. As a default we
# take a sample x-vector from the CMU ARCTIC x-vectors dataset; the demo itself extracts an
# x-vector from the uploaded reference audio inside voice_clone() (see below).
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


def voice_clone(text_audio_path, voice_audio_path):
    """
    Clones the voice from voice_audio_path to speak the content of text_audio_path.
    (Note: This implementation primarily uses text from text_audio_path and applies a generic voice.
           True "cloning" from the *content* of text_audio_path while applying the *style*
           of voice_audio_path for arbitrary content is more advanced.
           Here, we assume text_audio_path provides the text to be spoken,
           and voice_audio_path provides the *target voice characteristics*.)
    """
    try:
        # SpeechT5 is text-to-speech only, so two additional models are required:
        #   - an ASR model (Whisper) to transcribe the *content* of the first file, and
        #   - a speaker-embedding model to capture the *voice identity* of the second file,
        #     because SpeechT5 expects a pre-computed x-vector and cannot derive one from
        #     arbitrary audio on its own (ECAPA-TDNN / x-vector models from speechbrain or
        #     pyannote.audio are the usual choices).
        # Both additions enlarge requirements.txt (transformers' Whisper pipeline, speechbrain).

        # Step 1: transcribe the first audio file to get the text to speak.
        from transformers import pipeline
        
        # Initialize ASR pipeline
        asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2", device=0 if torch.cuda.is_available() else -1)
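        # NOTE: Whisper and the SpeechBrain model below are (re)loaded on every call to keep
        # this demo self-contained; a production app would load them once at module scope.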
        
        # Transcribe the content from the first audio file
        print(f"Transcribing {text_audio_path}...")
        transcription_result = asr_pipeline(text_audio_path)
        target_text = transcription_result["text"].strip()
        print(f"Transcribed text: {target_text}")

        if not target_text:
            return None, "No discernible text extracted from the first audio file. Please try a clearer audio."

        # Step 2: extract a speaker embedding (x-vector) from the second audio file.
        # SpeechT5 expects a 512-dim x-vector but ships no extractor for arbitrary audio,
        # so we use SpeechBrain's pre-trained x-vector model (speechbrain/spkrec-xvect-voxceleb).
        # pyannote.audio's embedding models would be an alternative.
        # Requires: pip install speechbrain
        try:
            from speechbrain.inference.speaker import EncoderClassifier
        except ImportError:  # older speechbrain releases
            from speechbrain.pretrained import EncoderClassifier

        # Initialize the speaker embedding model (its x-vectors are 512-dim, matching SpeechT5)
        speaker_embedding_model_sb = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-xvect-voxceleb",
            savedir="pretrained_models/spkrec-xvect-voxceleb",
            run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"}
        )

        print(f"Extracting speaker embedding from {voice_audio_path}...")
        # Load the second audio file; `encode_batch` expects a mono waveform at 16 kHz.
        voice_waveform, sr = torchaudio.load(voice_audio_path)
        
        # Need to ensure correct sampling rate and mono channel for `speechbrain`
        if voice_waveform.shape[0] > 1: # Convert to mono
            voice_waveform = voice_waveform.mean(dim=0, keepdim=True)

        # SpeechBrain's model expects a specific sampling rate (usually 16kHz)
        if sr != 16000:
            resampler = torchaudio.transforms.Resample(sr, 16000)
            voice_waveform = resampler(voice_waveform)
        
        # Extract the speaker embedding; encode_batch returns shape (batch, 1, 512)
        with torch.no_grad():
            speaker_embeddings = speaker_embedding_model_sb.encode_batch(voice_waveform)
            # L2-normalize (as in the SpeechT5 fine-tuning recipe) and reshape to (1, 512)
            speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=-1)
            speaker_embeddings = speaker_embeddings.squeeze(1)
        print("Speaker embedding extracted.")

        # Synthesize speech using SpeechT5
        inputs = processor(text=target_text, return_tensors="pt")
        
        # Move inputs and speaker_embeddings to the same device as the model
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}
            model.to("cuda")
            vocoder.to("cuda")
            speaker_embeddings = speaker_embeddings.to("cuda")

        print("Generating speech...")
        


        # Generate speech
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

        # Normalize to be within [-1, 1] for audio playback
        speech = speech.cpu().numpy()
        max_val = max(abs(speech.min()), abs(speech.max()))
        if max_val > 1.0:
            speech = speech / max_val
        
        print("Speech generated. Saving to temporary file...")

        # Save the generated audio to a temporary file
        output_path = "cloned_voice_output.wav"
        sf.write(output_path, speech, vocoder.config.sampling_rate)
        
        return output_path, "Voice cloning successful!"

    except Exception as e:
        error_message = f"An error occurred: {e}"
        print(error_message)
        import traceback
        traceback.print_exc()
        return None, error_message

# Gradio Interface
iface = gr.Interface(
    fn=voice_clone,
    # NOTE: the older `source="upload"` kwarg is omitted; uploads are enabled by default.
    inputs=[
        gr.Audio(type="filepath", label="First audio file (provides the text/content)"),
        gr.Audio(type="filepath", label="Second audio file (provides the voice to imitate)")
    ],
    outputs=[
        gr.Audio(label="Cloned voice"),
        gr.Textbox(label="Status")
    ],
    title="Voice Cloner with Hugging Face",
    description="Upload a first audio file to have its text transcribed and a second audio file whose voice should be imitated; the text of the first file is then synthesized in the voice of the second file. (mp3/wav supported)",
    examples=[
        [
            "audio_examples/example_content.wav", # Example for content (what to say)
            "audio_examples/example_voice.wav"    # Example for voice (how to say it)
        ]
    ]
)

if __name__ == "__main__":
    # Create an example directory and dummy files if they don't exist
    os.makedirs("audio_examples", exist_ok=True)
    if not os.path.exists("audio_examples/example_content.wav"):
        # Create a dummy WAV file for content (a short sine tone)
        samplerate = 16000
        duration = 2.0  # seconds
        frequency = 440  # Hz
        t = np.linspace(0., duration, int(samplerate * duration), endpoint=False)
        sine_wave = 0.5 * np.sin(2 * np.pi * frequency * t)
        sf.write("audio_examples/example_content.wav", sine_wave.astype(np.float32), samplerate)
        print("Created dummy audio_examples/example_content.wav")
    
    if not os.path.exists("audio_examples/example_voice.wav"):
        # Create another dummy WAV file for voice
        samplerate = 16000
        duration = 1.5  # seconds
        frequency = 880  # Hz
        t = np.linspace(0., duration, int(samplerate * duration), endpoint=False)
        sine_wave = 0.5 * np.sin(2 * np.pi * frequency * t)
        sf.write("audio_examples/example_voice.wav", sine_wave.astype(np.float32), samplerate)
        print("Created dummy audio_examples/example_voice.wav")

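    # launch() starts the Gradio server (default: http://127.0.0.1:7860)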
    iface.launch()