Update app.py
app.py
CHANGED
@@ -1,100 +1,336 @@
-from torchaudio.transforms import Resample
-import numpy as np
-import os
-from datetime import datetime
-        return
-    submit_btn.click(
-        fn=clone_voice,
-        inputs=[source_audio, target_audio, output_format],
-        outputs=[output_audio]
-    )
-
-    output_audio.change(
-        lambda x: gr.File(value=x),
-        inputs=[output_audio],
-        outputs=[download_link]
-    )
import gradio as gr
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import torchaudio
import soundfile as sf
import os

# Load models
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
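# Note: because these from_pretrained() calls run at module import time, the three checkpoints
# are downloaded and loaded once while the Space starts up, not on each request.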

# Load speaker embeddings (a sample speaker, for simplicity).
# For real voice cloning you would extract the embedding from your target audio; for a quick
# demo we use a pre-computed x-vector from a public dataset.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

# In a real application the speaker embedding would be extracted from the second audio file.
# SpeechT5 itself does not ship a model for computing x-vectors from arbitrary audio; typically
# a separate speaker-embedding model (e.g., an x-vector/ECAPA-TDNN encoder) is used for that.
# As a placeholder, pick a pre-existing embedding from the dataset.
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
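# A small sanity-check sketch: SpeechT5's generate_speech expects speaker embeddings of shape
# (batch, 512), and the CMU-Arctic x-vectors are 512-dimensional, so the placeholder loaded
# above should come out as a (1, 512) tensor.
assert speaker_embeddings.shape == (1, 512), speaker_embeddings.shape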


def voice_clone(text_audio_path, voice_audio_path):
    """
    Speaks the content of text_audio_path in the voice of voice_audio_path.

    text_audio_path supplies the *content*: it is transcribed with an ASR model.
    voice_audio_path supplies the *target voice characteristics*: a speaker embedding
    is extracted from it and passed to SpeechT5 for synthesis.
    """
    try:
        # Plan (the request, "تقلید صدای فایل اول را از فایل دوم", i.e. "imitate the voice of the
        # first file from the second file", is read as follows):
        #   1. File 1 (`text_audio_path`) provides the *content*: it is transcribed with an ASR
        #      model. SpeechT5 is text-to-speech only, so ASR (e.g., Whisper) is required here.
        #   2. File 2 (`voice_audio_path`) provides the *voice identity*: a speaker embedding
        #      (x-vector) is extracted from it. SpeechT5Processor has no built-in way to compute
        #      x-vectors from arbitrary audio, so a separate speaker-embedding model is needed
        #      (e.g., an x-vector/ECAPA-TDNN encoder, or pyannote.audio).
        #   3. SpeechT5 synthesizes the transcribed text, conditioned on that embedding.
        #
        # Simpler fallbacks that were considered and rejected (they are not true voice cloning):
        #   - ask the user to type the text instead of transcribing File 1;
        #   - reuse the generic, pre-defined speaker embedding loaded above instead of deriving
        #     one from File 2.
        #
        # Extra dependencies this plan adds:
        #   pip install git+https://github.com/huggingface/transformers.git openai-whisper optimum accelerate

        from transformers import pipeline

        # Initialize the ASR pipeline.
        # Note: this loads Whisper on every call; in a real app it would be loaded once at module level.
        asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2", device=0 if torch.cuda.is_available() else -1)

        # Transcribe the content from the first audio file
        print(f"Transcribing {text_audio_path}...")
        transcription_result = asr_pipeline(text_audio_path)
        target_text = transcription_result["text"]
        print(f"Transcribed text: {target_text}")

        if not target_text.strip():
            return None, "No discernible text extracted from the first audio file. Please try a clearer audio."

        # Extract speaker embeddings from the second audio file.
        # This is the trickiest part: SpeechT5 conditions on x-vectors but does not provide a way
        # to compute them from arbitrary audio; the embeddings used in its examples are
        # pre-extracted or come from a dataset. A dedicated speaker-embedding model is therefore
        # needed, and how accurate that embedding is largely determines how closely the cloned
        # voice matches the target (the "low error" requirement).
        #
        # One option is pyannote.audio (not used here, to keep requirements and device handling
        # simpler):
        #   from pyannote.audio import Inference as SpeakerInference
        #   speaker_embedding_model = SpeakerInference("pyannote/embedding", device=0)
        #   waveform, sample_rate = torchaudio.load(voice_audio_path)
        #   # Resample if necessary for the speaker embedding model
        #   if sample_rate != speaker_embedding_model.sample_rate:
        #       resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=speaker_embedding_model.sample_rate)
        #       waveform = resampler(waveform)
        #   speaker_embeddings_from_file = speaker_embedding_model(waveform.unsqueeze(0)).squeeze(0)  # assuming a single speaker
        #   speaker_embeddings = speaker_embeddings_from_file.unsqueeze(0)
        #
        # Falling back to the generic, pre-defined `speaker_embeddings` loaded above would be
        # simpler, but then the second audio file would not influence the output at all, which
        # defeats "cloning the voice from the second file".
        #
        # Instead, SpeechBrain is used to derive the speaker embedding (x-vector) directly
        # from `voice_audio_path`:
        #   pip install speechbrain

        # SpeechT5 expects 512-dimensional x-vectors, so use SpeechBrain's x-vector encoder
        # (speechbrain/spkrec-xvect-voxceleb), which produces embeddings of that size.
        from speechbrain.pretrained import EncoderClassifier

        speaker_embedding_model_sb = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-xvect-voxceleb",
            savedir="pretrained_models/spkrec-xvect-voxceleb",
            run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"},
        )

        print(f"Extracting speaker embedding from {voice_audio_path}...")
        # Load the second audio file; the encoder expects a mono 16 kHz waveform tensor.
        voice_waveform, sr = torchaudio.load(voice_audio_path)

        if voice_waveform.shape[0] > 1:  # Convert to mono
            voice_waveform = voice_waveform.mean(dim=0, keepdim=True)

        if sr != 16000:
            resampler = torchaudio.transforms.Resample(sr, 16000)
            voice_waveform = resampler(voice_waveform)

        # Extract the speaker embedding and bring it to the (1, 512) shape SpeechT5 expects.
        with torch.no_grad():
            speaker_embeddings = speaker_embedding_model_sb.encode_batch(voice_waveform)  # (1, 1, 512)
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2).squeeze(1)
        print("Speaker embedding extracted.")
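        # Practical caveat: very short or noisy reference clips tend to yield unreliable speaker
        # embeddings, which makes the synthesized voice drift away from the target speaker.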

        # Synthesize speech using SpeechT5
        inputs = processor(text=target_text, return_tensors="pt")

        # Move inputs and speaker_embeddings to the same device as the model
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}
            model.to("cuda")
            vocoder.to("cuda")
            speaker_embeddings = speaker_embeddings.to("cuda")

        print("Generating speech...")
        # (Design choice recap: a basic variant of this demo could fall back to the predefined
        # speaker embedding, which is simpler but is not really cloning; the advanced variant
        # used here extracts the speaker embedding from the second audio file with SpeechBrain,
        # which is what the "low error" cloning requirement calls for.)

        # Generate speech
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

        # Normalize to be within [-1, 1] for audio playback
        speech = speech.cpu().numpy()
        max_val = max(abs(speech.min()), abs(speech.max()))
        if max_val > 1.0:
            speech = speech / max_val

        print("Speech generated. Saving to temporary file...")

        # Save the generated audio to a temporary file
        output_path = "cloned_voice_output.wav"
        sf.write(output_path, speech, vocoder.config.sampling_rate)

        return output_path, "Voice cloning successful!"

    except Exception as e:
        error_message = f"An error occurred: {e}"
        print(error_message)
        import traceback
        traceback.print_exc()
        return None, error_message

# Gradio Interface
iface = gr.Interface(
    fn=voice_clone,
    inputs=[
        gr.Audio(type="filepath", label="فایل صوتی اول (برای استخراج متن)", source="upload"),  # "first audio file (to extract the text)"
        gr.Audio(type="filepath", label="فایل صوتی دوم (برای تقلید صدا)", source="upload"),  # "second audio file (to imitate its voice)"
    ],
    outputs=[
        gr.Audio(label="صدای شبیه سازی شده"),  # "cloned voice"
        gr.Textbox(label="وضعیت"),  # "status"
    ],
    title="Voice Cloner (تقلید صدا) با HuggingFace",  # "Voice Cloner (voice imitation) with HuggingFace"
    description="فایل صوتی اول را آپلود کنید تا متن آن استخراج شود. فایل صوتی دوم را آپلود کنید تا صدای آن تقلید شود و متن فایل اول با صدای فایل دوم تولید شود. (پشتیبانی از mp3/wav)",
    # Description in English: "Upload the first audio file so its text is extracted. Upload the
    # second audio file so its voice is imitated, and the first file's text will be generated in
    # the second file's voice. (mp3/wav supported)"
    examples=[
        [
            "audio_examples/example_content.wav",  # Example for content (what to say)
            "audio_examples/example_voice.wav",  # Example for voice (how to say it)
        ]
    ],
)
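# Note on Gradio versions: `source="upload"` is the Gradio 3.x keyword argument; on Gradio 4.x
# the equivalent would be `sources=["upload"]`. The diff does not pin a Gradio version, so which
# API applies here is an assumption.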

if __name__ == "__main__":
    # Create an example directory and dummy files if they don't exist
    import numpy as np  # needed by both dummy-file branches below

    os.makedirs("audio_examples", exist_ok=True)
    if not os.path.exists("audio_examples/example_content.wav"):
        # Create a dummy WAV file for content
        samplerate = 16000
        duration = 2.0  # seconds
        frequency = 440  # Hz
        t = np.linspace(0., duration, int(samplerate * duration), endpoint=False)
        sine_wave = 0.5 * np.sin(2 * np.pi * frequency * t)
        sf.write("audio_examples/example_content.wav", sine_wave.astype(np.float32), samplerate)
        print("Created dummy audio_examples/example_content.wav")

    if not os.path.exists("audio_examples/example_voice.wav"):
        # Create another dummy WAV file for voice
        samplerate = 16000
        duration = 1.5  # seconds
        frequency = 880  # Hz
        t = np.linspace(0., duration, int(samplerate * duration), endpoint=False)
        sine_wave = 0.5 * np.sin(2 * np.pi * frequency * t)
        sf.write("audio_examples/example_voice.wav", sine_wave.astype(np.float32), samplerate)
        print("Created dummy audio_examples/example_voice.wav")

    iface.launch()
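For a quick local check of the new voice_clone function, it can also be called directly in a Python session (a sketch; the dummy sine-wave examples generated above will not transcribe to meaningful text, so real speech recordings are needed for a useful result):

    from app import voice_clone
    out_path, status = voice_clone("audio_examples/example_content.wav",
                                   "audio_examples/example_voice.wav")
    print(status, out_path)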