Spaces:

suprimedev
/

cloner

Sleeping

App Files Files Community

cloner / app.py

suprimedev

Update app.py

94c2b30 verified 3 months ago

raw

history blame

18.1 kB

	import gradio as gr
	from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
	from datasets import load_dataset
	import torch
	import torchaudio
	import soundfile as sf
	import os

	# Load models
	processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
	model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
	vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

	# Load speaker embeddings (we'll use a sample speaker for simplicity)
	# For real voice cloning, you'd extract embeddings from your target audio.
	# For a quick demo, we'll use a pre-defined one from a dataset.
	# This is just an example, a robust voice cloner would extract embeddings directly.
	embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

	# You need to choose an appropriate speaker embedding.
	# For a general solution, you'd typically process an encoding of the target voice.
	# Let's take a sample speaker embedding. In a real application,
	# you'd extract this from the second audio file.
	# As a placeholder, we'll use a pre-existing one.
	# For true voice cloning with the second audio, you'd need a model
	# that can extract speaker embeddings from arbitrary audio.
	# SpeechT5 itself doesn't directly provide a pre-trained model for this from an arbitrary audio.
	# Typically, you'd use a separate speaker embedding extraction model (e.g., from ECAPA-TDNN).
	# For this example, let's use a placeholder.
	speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


	def voice_clone(text_audio_path, voice_audio_path):
	"""
	Clones the voice from voice_audio_path to speak the content of text_audio_path.
	(Note: This implementation primarily uses text from text_audio_path and applies a generic voice.
	True "cloning" from the content of text_audio_path while applying the style
	of voice_audio_path for arbitrary content is more advanced.
	Here, we assume text_audio_path provides the text to be spoken,
	and voice_audio_path provides the target voice characteristics.)
	"""
	try:
	# 1. Read the audio file where we want to extract the content (text or speech)
	# For simplicity, we'll assume the text_audio_path contains speech that we want to transcribe
	# and then re-synthesize in the style of voice_audio_path.
	# NOTE: SpeechT5 is primarily Text-to-Speech. To get text from text_audio_path,
	# you would need an ASR (Automatic Speech Recognition) model.
	# For a simpler demo, let's assume text_audio_path could be transcribed
	# or directly provides the text.

	# For this example, let's assume `text_audio_path` is the source of the text
	# and `voice_audio_path` is the source of the voice characteristics.

	# Step 1: Read the audio file to get the content/text (A real implementation would need ASR here)
	# As a placeholder, let's just make up some text or simplify.
	# A more robust solution would involve:
	# a) ASR on `text_audio_path` to get the text.
	# b) Feature extraction from `voice_audio_path` to get an accurate speaker embedding.

	# For now, let's simplify and make a strong assumption:
	# The user provides a text content implicitly via the first audio.
	# For ASR, we'd need another model. Let's make an assumption for the demo:
	# we will use a fixed text, and apply the speaker embedding from `voice_audio_path`.
	# THIS IS NOT TRUE VOICE CLONING OF CONTENT + VOICE.
	# It's more Text-to-Speech with a specific speaker.

	# Let's say, for demonstration, we will let the user type the text,
	# OR pretend to extract text from `text_audio_path` and use a generic speaker embedding
	# if we cannot extract accurate speaker embeddings from arbitrary audio using SpeechT5 directly.

	# A more practical approach for your request:
	# 1. User provides "source audio" (audio_1). We want to extract content/text from it.
	# 2. User provides "target voice audio" (audio_2). We want to extract speaker identity from it.
	# 3. Synthesize the extracted text with the extracted speaker identity.

	# SpeechT5 doesn't have a direct "extract speaker embedding from arbitrary audio" function
	# built into its `SpeechT5Processor`. You'd typically use a separate model for that (e.g., ECAPA-TDNN).
	# For the sake of this demo with SpeechT5, we will use a pre-extracted speaker embedding.
	# This means true "cloning the voice from the second file" (if that file is arbitrary)
	# is complex.

	# Let's refine the approach for a Hugging Face model combo:
	# - Use a robust ASR model (e.g., Whisper) for the first audio to get text.
	# - Use a speaker embedding model (e.g., from pyannote.audio or similar) for the second audio.
	# - Use SpeechT5 for TTS.

	# Given the prompt, "تقلید صدای فایل اول را از فایل دوم", it implies:
	# File 1: Provides the content (what is said).
	# File 2: Provides the voice style/identity.

	# For a simplified demo without an ASR model and a dedicated speaker embedding extractor:
	# We will prompt the user for the text they want to say.
	# The second audio will be assumed to be able to provide a speaker embedding.

	# For a robust solution, you'd add:
	# from transformers import pipeline
	# asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2")
	# target_text = asr_pipeline(text_audio_path)["text"]

	# And for speaker embedding extraction from voice_audio_path:
	# This is the trickiest part with SpeechT5 directly. It expects a pre-computed xvector.
	# You could use a library like `pyannote.audio` to get xvectors:
	# from pyannote.audio import Inference
	# speaker_diarization_model = Inference("pyannote/[email protected]", device=0)
	# embeddings = speaker_diarization_model.get_embedding(voice_audio_path)
	# speaker_embeddings = torch.mean(embeddings, dim=0).unsqueeze(0) # or careful selection

	# Let's modify the Gradio interface to take TEXT input AND two audios
	# with clearer roles, or make a very strong assumption.

	# For a demonstration, let's assume `voice_audio_path` provides speaker characteristics,
	# AND we will use a generic default text, OR we ask for text explicitly.

	# Let's try to extract speaker embeddings using a more general method if possible,
	# but this is beyond SpeechT5's direct scope.
	# For demonstration, we will use a pre-defined speaker embedding.
	# To truly take it from `voice_audio_path`, you'd need an `inference` model
	# that generates these embeddings from arbitrary audio which is not part of `SpeechT5ForTextToSpeech`.

	# If you want to use the content of `text_audio_path` AND the voice of `voice_audio_path`:
	# The current SpeechT5 doesn't provide a direct way to extract x-vectors from an arbitrary audio file.
	# So, we'll take a simplified approach for the demo:
	# 1. We will use a fixed text (e.g., "Hello, this is a test of voice cloning.")
	# 2. We will attempt to use a speaker embedding. Since SpeechT5 needs a pre-computed x-vector,
	# and we can't easily extract it from an arbitrary `voice_audio_path` within this simple setup,
	# we will stick to the generic `speaker_embeddings` loaded earlier.
	# This means true "voice cloning based on the second input audio" is limited by the model's structure.

	# Let's reconsider the prompt: "تقلید صدای فایل اول را از فایل دوم".
	# This implies:
	# - File 1: The source content (what to say).
	# - File 2: The target voice (how to say it).

	# To do this properly, we need:
	# 1. ASR model to transcribe File 1.
	# 2. Speaker embedding model to extract embeddings from File 2.
	# 3. SpeechT5 to synthesize the text from 1 with embeddings from 2.

	# Let's use `pyaudio` for reading and `transformers` for ASR.
	# This will significantly increase the `requirements.txt`.

	# New plan:
	# Assume `text_audio_path` (File 1) is where we get the text to speak.
	# Assume `voice_audio_path` (File 2) is where we get the speaker's voice characteristics.
	# We need an ASR for File 1 and a speaker embedding extractor for File 2.

	# Let's install Whisper for ASR.
	# pip install git+https://github.com/huggingface/transformers.git openai-whisper optimum accelerate

	from transformers import pipeline

	# Initialize ASR pipeline
	asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2", device=0 if torch.cuda.is_available() else -1)

	# Transcribe the content from the first audio file
	print(f"Transcribing {text_audio_path}...")
	transcription_result = asr_pipeline(text_audio_path)
	target_text = transcription_result["text"]
	print(f"Transcribed text: {target_text}")

	if not target_text:
	return None, "No discernible text extracted from the first audio file. Please try a clearer audio."

	# Extract speaker embeddings from the second audio file
	# This is the most challenging part with SpeechT5 directly.
	# Although SpeechT5 uses x-vectors, it doesn't provide a direct inference method
	# to extract them from arbitrary audio files for new speakers.
	# The 'speaker_embeddings' used in examples are usually pre-extracted or come from a dataset.
	# For a robust solution, you'd need a separate model like pyannote.audio's speaker embedding model.
	# For this demo, let's use a simpler approach:
	# We will try to use the `voice_audio_path` to generate a "speaker embedding"
	# but it won't be as precise as a dedicated model. This is where the
	# "low error" might be compromised if the speaker embedding isn't accurate.

	# For a proper solution, you'd need something like:
	# from pyannote.audio import Inference as SpeakerInference
	# speaker_embedding_model = SpeakerInference("pyannote/embedding", device=0)
	# waveform, sample_rate = torchaudio.load(voice_audio_path)
	# # Resample if necessary for the speaker embedding model
	# if sample_rate != speaker_embedding_model.sample_rate:
	# resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=speaker_embedding_model.sample_rate)
	# waveform = resampler(waveform)
	# speaker_embeddings_from_file = speaker_embedding_model(waveform.unsqueeze(0)).squeeze(0) # Assuming single speaker
	# speaker_embeddings = speaker_embeddings_from_file.unsqueeze(0)

	# Given the constraint to keep it simpler with available HuggingFace models (SpeechT5, Whisper),
	# let's acknowledge the limitation: directly extracting robust speaker embeddings from arbitrary audio
	# for SpeechT5 is not straightforward without more models.
	# We will use the pre-defined `speaker_embeddings` for the demo, which means
	# the "voice cloning" from `voice_audio_path` is highly simplified/simulated using a generic voice.

	# If `speaker_embeddings` were accurately derived from `voice_audio_path`, this would be true cloning.
	# For a basic demo, let's proceed with the generic speaker embedding to show the TTS part.
	# To truly use the voice from the second file, we need a way to extract x-vectors.
	# FOR THIS DEMO, WE WILL USE THE PREDEFINED SPEAKER EMBEDDING.
	# THE SECOND AUDIO FILE IS CURRENTLY NOT USED FOR SPEAKER EMBEDDING EXTRACTION.
	# This is a critical limitation for "voice cloning" from an arbitrary file.

	# If you specifically want to use the voice of the second file,
	# the recommended approach would be to extract x-vectors using a separate
	# model (e.g., from `pyannote.audio` or `speechbrain`).
	# Since `pyannote.audio` might add complexity to `requirements.txt`
	# and device handling, let's keep it with what's easily integrated by `transformers`.

	# For an actual voice cloning, the second audio file provides the speaker's timbre.
	# SpeechT5 accepts `speaker_embeddings`. How to get these from an arbitrary MP3?
	# This is the core problem. The models `google/speakertype` or `s3prl/pretrain_ecapa_tdnn`
	# can extract these. If we add `speechbrain` to requirements, we can do it.

	# Let's add `speechbrain` for speaker embedding extraction.
	# pip install speechbrain

	from speechbrain.inference.SpeakerEmbedding import SpeakerEmbedding as SpeechBrainSpeakerEmbedding

	# Initialize Speaker Embedding Model
	speaker_embedding_model_sb = SpeechBrainSpeakerEmbedding.from_hparams(
	source="speechbrain/spkrec-ecapa-tdnn",
	savedir="pretrained_models/spkrec-ecapa-tdnn",
	run_opts={"device":"cuda" if torch.cuda.is_available() else "cpu"}
	)

	print(f"Extracting speaker embedding from {voice_audio_path}...")
	# Load the second audio file for speaker embedding
	# SpeechBrain's `SpeakerEmbedding` expects a waveform tensor.
	# Load and resample if necessary
	voice_waveform, sr = torchaudio.load(voice_audio_path)

	# Need to ensure correct sampling rate and mono channel for `speechbrain`
	if voice_waveform.shape[0] > 1: # Convert to mono
	voice_waveform = voice_waveform.mean(dim=0, keepdim=True)

	# SpeechBrain's model expects a specific sampling rate (usually 16kHz)
	if sr != 16000:
	resampler = torchaudio.transforms.Resample(sr, 16000)
	voice_waveform = resampler(voice_waveform)

	# Extract the speaker embedding
	speaker_embeddings_from_voice_audio = speaker_embedding_model_sb.encode_batch(voice_waveform).squeeze(0)
	# SpeechT5 expects embeddings with shape (1, 512) for a single speaker
	speaker_embeddings = speaker_embeddings_from_voice_audio.unsqueeze(0)
	print("Speaker embedding extracted.")

	# Synthesize speech using SpeechT5
	inputs = processor(text=target_text, return_tensors="pt")

	# Move inputs and speaker_embeddings to the same device as the model
	if torch.cuda.is_available():
	inputs = {k: v.to("cuda") for k, v in inputs.items()}
	model.to("cuda")
	vocoder.to("cuda")
	speaker_embeddings = speaker_embeddings.to("cuda")

	print("Generating speech...")



	# Generate speech
	speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

	# Normalize to be within [-1, 1] for audio playback
	speech = speech.cpu().numpy()
	max_val = max(abs(speech.min()), abs(speech.max()))
	if max_val > 1.0:
	speech = speech / max_val

	print("Speech generated. Saving to temporary file...")

	# Save the generated audio to a temporary file
	output_path = "cloned_voice_output.wav"
	sf.write(output_path, speech, vocoder.config.sampling_rate)

	return output_path, "Voice cloning successful!"

	except Exception as e:
	error_message = f"An error occurred: {e}"
	print(error_message)
	import traceback
	traceback.print_exc()
	return None, error_message

	# Gradio Interface
	iface = gr.Interface(
	fn=voice_clone,
	inputs=[
	gr.Audio(type="filepath", label="فایل صوتی اول (برای استخراج متن)", source="upload"),
	gr.Audio(type="filepath", label="فایل صوتی دوم (برای تقلید صدا)", source="upload")
	],
	outputs=[
	gr.Audio(label="صدای شبیه سازی شده"),
	gr.Textbox(label="وضعیت")
	],
	title="Voice Cloner (تقلید صدا) با HuggingFace",
	description="فایل صوتی اول را آپلود کنید تا متن آن استخراج شود. فایل صوتی دوم را آپلود کنید تا صدای آن تقلید شود و متن فایل اول با صدای فایل دوم تولید شود. (پشتیبانی از mp3/wav)",
	examples=[
	[
	"audio_examples/example_content.wav", # Example for content (what to say)
	"audio_examples/example_voice.wav" # Example for voice (how to say it)
	]
	]
	)

	if __name__ == "__main__":
	# Create an example directory and dummy files if they don't exist
	os.makedirs("audio_examples", exist_ok=True)
	if not os.path.exists("audio_examples/example_content.wav"):
	# Create a dummy WAV file for content
	import numpy as np
	samplerate = 16000
	duration = 2.0 # seconds
	frequency = 440 # Hz
	t = np.linspace(0., duration, int(samplerate * duration), endpoint=False)
	sine_wave = 0.5 * np.sin(2 * np.pi * frequency * t)
	sf.write("audio_examples/example_content.wav", sine_wave.astype(np.float32), samplerate)
	print("Created dummy audio_examples/example_content.wav")

	if not os.path.exists("audio_examples/example_voice.wav"):
	# Create another dummy WAV file for voice
	samplerate = 16000
	duration = 1.5 # seconds
	frequency = 880 # Hz
	t = np.linspace(0., duration, int(samplerate * duration), endpoint=False)
	sine_wave = 0.5 * np.sin(2 * np.pi * frequency * t)
	sf.write("audio_examples/example_voice.wav", sine_wave.astype(np.float32), samplerate)
	print("Created dummy audio_examples/example_voice.wav")

	iface.launch()