import os

import gradio as gr
import numpy as np
import soundfile as sf
import torch
import torchaudio
from datasets import load_dataset
from speechbrain.inference.speaker import EncoderClassifier
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, pipeline
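# A plausible requirements.txt for this Space (package set inferred from the
# imports above; no versions are pinned by the original):
#   gradio
#   torch
#   torchaudio
#   transformers
#   datasets
#   soundfile
#   sentencepiece
#   speechbrain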
# Load the SpeechT5 TTS model, its processor, and the HiFi-GAN vocoder.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# Fallback speaker embedding. SpeechT5 conditions generation on a 512-dim
# x-vector; this one is pre-extracted from the CMU ARCTIC dataset. SpeechT5
# itself has no way to derive x-vectors from arbitrary audio, so voice_clone()
# below uses a dedicated speaker embedding model for the uploaded voice sample,
# and this value is only a generic default for demo purposes.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
default_speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# ASR model (Whisper) for transcribing the *content* of the first audio file.
# Loaded once at startup rather than on every request.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v2",
    device=0 if torch.cuda.is_available() else -1,
)

# Speaker embedding model for the *voice* of the second audio file.
# speechbrain/spkrec-xvect-voxceleb produces the 512-dim x-vectors SpeechT5
# expects (it is also the model the CMU ARCTIC x-vectors were extracted with).
speaker_embedding_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="pretrained_models/spkrec-xvect-voxceleb",
    run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"},
)
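# Sanity check; the assumption here is that the dataset x-vectors are 512-dim,
# matching the speaker embedding size SpeechT5 was trained with.
assert default_speaker_embeddings.shape == (1, 512)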
def voice_clone(text_audio_path, voice_audio_path):
    """
    Speak the content of the first audio file in the voice of the second.

    text_audio_path:  provides the *text* -- its speech is transcribed with Whisper.
    voice_audio_path: provides the *voice* -- a speaker x-vector is extracted from it.

    (True joint content+style cloning for arbitrary inputs is a harder problem;
    this is the straightforward ASR -> speaker embedding -> TTS decomposition.)
    """
    try:
        # Plan (the original request: "speak the first file's content in the
        # second file's voice"):
        #   1. Transcribe File 1 with Whisper to get the text to speak.
        #   2. Extract a speaker x-vector from File 2 with SpeechBrain.
        #   3. Synthesize the text with SpeechT5, conditioned on that x-vector.
        #
        # SpeechT5 is purely text-to-speech: it consumes pre-computed x-vectors
        # and provides neither ASR nor speaker embedding extraction, which is
        # why steps 1 and 2 need separate models (alternatives for step 2
        # include ECAPA-TDNN models or pyannote.audio embeddings).
        # Step 1: transcribe the content of the first audio file.
        print(f"Transcribing {text_audio_path}...")
        target_text = asr_pipeline(text_audio_path)["text"].strip()
        print(f"Transcribed text: {target_text}")
        if not target_text:
            return None, "No discernible text extracted from the first audio file. Please try a clearer recording."
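        # Note: for recordings longer than ~30 s, the Whisper pipeline should
        # be constructed with chunk_length_s=30 to enable chunked long-form
        # transcription; whether that matters here depends on the uploads.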
        # Step 2: extract the speaker x-vector from the second audio file.
        # (An alternative to SpeechBrain is pyannote.audio, roughly:
        #     from pyannote.audio import Inference
        #     inference = Inference("pyannote/embedding", window="whole")
        #     embedding = inference(voice_audio_path)
        #  but those embeddings are not trained to match the x-vectors SpeechT5
        #  was conditioned on, so synthesis quality may suffer.)
print(f"Extracting speaker embedding from {voice_audio_path}...") | |
# Load the second audio file for speaker embedding | |
# SpeechBrain's `SpeakerEmbedding` expects a waveform tensor. | |
# Load and resample if necessary | |
voice_waveform, sr = torchaudio.load(voice_audio_path) | |
# Need to ensure correct sampling rate and mono channel for `speechbrain` | |
if voice_waveform.shape[0] > 1: # Convert to mono | |
voice_waveform = voice_waveform.mean(dim=0, keepdim=True) | |
# SpeechBrain's model expects a specific sampling rate (usually 16kHz) | |
if sr != 16000: | |
resampler = torchaudio.transforms.Resample(sr, 16000) | |
voice_waveform = resampler(voice_waveform) | |
# Extract the speaker embedding | |
speaker_embeddings_from_voice_audio = speaker_embedding_model_sb.encode_batch(voice_waveform).squeeze(0) | |
# SpeechT5 expects embeddings with shape (1, 512) for a single speaker | |
speaker_embeddings = speaker_embeddings_from_voice_audio.unsqueeze(0) | |
print("Speaker embedding extracted.") | |
        # Step 3: synthesize speech with SpeechT5.
        inputs = processor(text=target_text, return_tensors="pt")
        # Move the models, inputs, and speaker embedding to the same device.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)
        vocoder.to(device)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        speaker_embeddings = speaker_embeddings.to(device)
        print("Generating speech...")
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
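        # With a vocoder supplied, generate_speech returns a 1-D float waveform
        # at the vocoder's sampling rate (16 kHz for microsoft/speecht5_hifigan).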
        # Normalize to [-1, 1] for audio playback.
        speech = speech.cpu().numpy()
        max_val = max(abs(speech.min()), abs(speech.max()))
        if max_val > 1.0:
            speech = speech / max_val
        print("Speech generated. Saving output...")
        output_path = "cloned_voice_output.wav"
        sf.write(output_path, speech, vocoder.config.sampling_rate)
        return output_path, "Voice cloning successful!"
    except Exception as e:
        import traceback
        traceback.print_exc()
        error_message = f"An error occurred: {e}"
        print(error_message)
        return None, error_message
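# Example of invoking voice_clone directly, outside Gradio (the paths here are
# hypothetical placeholders):
#   out_path, status = voice_clone("my_content.wav", "my_voice.wav")
#   print(status, out_path)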
# Gradio interface. Note: sources=[...] is the Gradio 4 argument name; older
# Gradio 3 releases used source="upload" instead.
iface = gr.Interface(
    fn=voice_clone,
    inputs=[
        gr.Audio(type="filepath", label="First audio file (provides the text)", sources=["upload"]),
        gr.Audio(type="filepath", label="Second audio file (provides the voice)", sources=["upload"]),
    ],
    outputs=[
        gr.Audio(label="Cloned voice"),
        gr.Textbox(label="Status"),
    ],
    title="Voice Cloner with HuggingFace",
    description=(
        "Upload the first audio file to have its text extracted, and the "
        "second to have its voice cloned; the first file's text is then "
        "synthesized in the second file's voice. (mp3/wav supported)"
    ),
    examples=[
        [
            "audio_examples/example_content.wav",  # content (what to say)
            "audio_examples/example_voice.wav",    # voice (how to say it)
        ]
    ],
)
if __name__ == "__main__": | |
# Create an example directory and dummy files if they don't exist | |
os.makedirs("audio_examples", exist_ok=True) | |
if not os.path.exists("audio_examples/example_content.wav"): | |
# Create a dummy WAV file for content | |
import numpy as np | |
samplerate = 16000 | |
duration = 2.0 # seconds | |
frequency = 440 # Hz | |
t = np.linspace(0., duration, int(samplerate * duration), endpoint=False) | |
sine_wave = 0.5 * np.sin(2 * np.pi * frequency * t) | |
sf.write("audio_examples/example_content.wav", sine_wave.astype(np.float32), samplerate) | |
print("Created dummy audio_examples/example_content.wav") | |
if not os.path.exists("audio_examples/example_voice.wav"): | |
# Create another dummy WAV file for voice | |
samplerate = 16000 | |
duration = 1.5 # seconds | |
frequency = 880 # Hz | |
t = np.linspace(0., duration, int(samplerate * duration), endpoint=False) | |
sine_wave = 0.5 * np.sin(2 * np.pi * frequency * t) | |
sf.write("audio_examples/example_voice.wav", sine_wave.astype(np.float32), samplerate) | |
print("Created dummy audio_examples/example_voice.wav") | |
iface.launch() | |