Update app.py
app.py CHANGED
@@ -1,326 +1,109 @@
- import gradio as gr
- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
- from datasets import load_dataset
- import torch
- import torchaudio
- import soundfile as sf
- import os
-
- # Load the SpeechT5 TTS models
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
- model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-
- # Load a speaker embedding. For real voice cloning you would extract the
- # embedding from the target audio; for this quick demo a pre-computed
- # x-vector from a dataset is used instead.
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-
- # SpeechT5 has no built-in way to extract a speaker embedding from arbitrary
- # audio; that requires a separate model (e.g. an ECAPA-TDNN speaker encoder).
- # As a placeholder, pick an existing x-vector from the dataset. In a real
- # application this would come from the second audio file.
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-
-
- def voice_clone(text_audio_path, voice_audio_path):
-     """
-     Speak the content of text_audio_path in the voice of voice_audio_path.
-     (text_audio_path supplies the text to be spoken; voice_audio_path supplies
-     the target voice characteristics. True cloning of arbitrary content in an
-     arbitrary voice is more involved than this demo.)
-     """
-     try:
-         # SpeechT5 is text-to-speech only. Getting text out of text_audio_path
-         # needs an ASR (automatic speech recognition) model, and getting the
-         # speaker identity out of voice_audio_path needs a speaker-embedding
-         # model. The prompt "تقلید صدای فایل اول را از فایل دوم" ("imitate the
-         # voice of the first file using the second file") implies:
-         #   - File 1: the source content (what is said).
-         #   - File 2: the target voice (how it is said).
-         # A proper pipeline is therefore:
-         #   1. ASR (e.g. Whisper) on file 1 to get the text.
-         #   2. A speaker-embedding model (e.g. ECAPA-TDNN via speechbrain or
-         #      pyannote.audio) on file 2 to get an x-vector.
-         #   3. SpeechT5 to synthesize that text with that x-vector.
-         # Because SpeechT5's processor cannot extract x-vectors itself, this
-         # demo falls back to the pre-defined speaker embedding above, so the
-         # second file is NOT actually used for the voice. That is a critical
-         # limitation for "voice cloning" from an arbitrary file, and the extra
-         # models below also grow requirements.txt considerably.
-
-         # pip install git+https://github.com/huggingface/transformers.git openai-whisper optimum accelerate
-         from transformers import pipeline
-
-         # Initialize ASR pipeline
-         asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2", device=0 if torch.cuda.is_available() else -1)
-
-         # With pyannote.audio, a dedicated speaker-embedding model would look like:
-         # from pyannote.audio import Inference as SpeakerInference
-         # speaker_embedding_model = SpeakerInference("pyannote/embedding", device=0)
-         # waveform, sample_rate = torchaudio.load(voice_audio_path)
-         # if sample_rate != speaker_embedding_model.sample_rate:
-         #     resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=speaker_embedding_model.sample_rate)
-         #     waveform = resampler(waveform)
-         # speaker_embeddings = speaker_embedding_model(waveform.unsqueeze(0)).squeeze(0).unsqueeze(0)
-
-         # pip install speechbrain
-         from speechbrain.inference.SpeakerEmbedding import SpeakerEmbedding as SpeechBrainSpeakerEmbedding
-
-         # Initialize Speaker Embedding Model
-         speaker_embedding_model_sb = SpeechBrainSpeakerEmbedding.from_hparams(
-             source="speechbrain/spkrec-ecapa-tdnn",
-             savedir="pretrained_models/spkrec-ecapa-tdnn",
-             run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"}
-         )
-
-         print(f"Extracting speaker embedding from {voice_audio_path}...")
-         # Load the second audio file; SpeechBrain expects a 16 kHz waveform tensor.
-         voice_waveform, sr = torchaudio.load(voice_audio_path)
-         if sr != 16000:
-             resampler = torchaudio.transforms.Resample(sr, 16000)
-             voice_waveform = resampler(voice_waveform)
-
-         # Synthesize speech using SpeechT5
-         inputs = processor(text=target_text, return_tensors="pt")
-         inputs = {k: v.to("cuda") for k, v in inputs.items()}
-         model.to("cuda")
-         vocoder.to("cuda")
-         speaker_embeddings = speaker_embeddings.to("cuda")
-
-         print("Generating speech...")
-         speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
-
-         # Normalize to be within [-1, 1] for audio playback
-         speech = speech.cpu().numpy()
-         max_val = max(abs(speech.min()), abs(speech.max()))
-         if max_val > 1.0:
-             speech = speech / max_val
-
-         output_path = "cloned_voice_output.wav"
-         sf.write(output_path, speech, vocoder.config.sampling_rate)
-
-         return output_path
-     except Exception as e:
-         ...
-
-
- gr.Interface(
-     fn=voice_clone,
-     inputs=...,
-     outputs=[
-         gr.Audio(label="صدای شبیه سازی شده"),
-         gr.Textbox(label="وضعیت")
-     ],
-     title="Voice Cloner (تقلید صدا) با HuggingFace",
-     description="فایل صوتی اول را آپلود کنید تا متن آن استخراج شود. فایل صوتی دوم را آپلود کنید تا صدای آن تقلید شود و متن فایل اول با صدای فایل دوم تولید شود. (پشتیبانی از mp3/wav)",
-     examples=[
-         [
-             "audio_examples/example_content.wav",  # example for content (what to say)
-             "audio_examples/example_voice.wav"     # example for voice (how to say it)
-         ]
-     ]
- )
-
- if __name__ == "__main__":
-     # Create an example directory and dummy files if they don't exist
-     os.makedirs("audio_examples", exist_ok=True)
-     if not os.path.exists("audio_examples/example_content.wav"):
-         # Create a dummy sine-wave WAV file for the content example
-         import numpy as np
-         samplerate = 16000
-         duration = 2.0  # seconds
-         frequency = 440  # Hz
-         t = np.linspace(0., duration, int(samplerate * duration), endpoint=False)
-         sine_wave = 0.5 * np.sin(2 * np.pi * frequency * t)
-         sf.write("audio_examples/example_content.wav", sine_wave.astype(np.float32), samplerate)
-         print("Created dummy audio_examples/example_content.wav")
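Editor's note: the comments in the removed version describe, but never implement, the pipeline they argue for: transcribe the first file with ASR, extract a speaker embedding from the second file, and synthesize the transcript with SpeechT5. Below is a minimal sketch of that pipeline, assuming Whisper for ASR and SpeechBrain's speechbrain/spkrec-xvect-voxceleb encoder (which yields the 512-dimensional x-vectors SpeechT5 expects); the model choices, the clone() helper, and the output path are illustrative assumptions, not code from this repository.

import torch
import torchaudio
import soundfile as sf
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier

# ASR for the content file (model choice is illustrative)
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Speaker encoder for the voice file; this checkpoint produces the
# 512-dimensional x-vectors SpeechT5 was trained with.
spk_encoder = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="pretrained_models/spkrec-xvect-voxceleb",
)

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


def clone(content_path: str, voice_path: str, out_path: str = "cloned.wav") -> str:
    # 1. Transcribe the first file to get the text to speak
    text = asr(content_path)["text"]

    # 2. Extract an x-vector from the second file (mono, 16 kHz)
    wav, sr = torchaudio.load(voice_path)
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)
    if sr != 16000:
        wav = torchaudio.transforms.Resample(sr, 16000)(wav)
    with torch.no_grad():
        emb = spk_encoder.encode_batch(wav)                                          # [1, 1, 512]
        speaker_embedding = torch.nn.functional.normalize(emb, dim=-1).squeeze(1)    # [1, 512]

        # 3. Synthesize the transcript in that voice
        inputs = processor(text=text, return_tensors="pt")
        speech = tts.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)

    # SpeechT5 generates 16 kHz audio
    sf.write(out_path, speech.numpy(), 16000)
    return out_path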
+ import gradio as gr
+ import torchaudio
+ from speechbrain.pretrained import EncoderClassifier
+ from speechbrain.pretrained import HIFIGAN
+ import torch
+ import tempfile
+ import shutil
+ import os
+ from pathlib import Path
+
+ # Initialize models
+ classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
+ hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech")
+
+
+ def extract_speaker_embedding(audio_file):
+     """Extract a speaker embedding from an audio file."""
+     signal, fs = torchaudio.load(audio_file)
+
+     # Resample to 16 kHz if needed
+     if fs != 16000:
+         resampler = torchaudio.transforms.Resample(fs, 16000)
+         signal = resampler(signal)
+         fs = 16000
+
+     # Downmix stereo to mono
+     if signal.shape[0] > 1:
+         signal = torch.mean(signal, dim=0, keepdim=True)
+
+     embeddings = classifier.encode_batch(signal)
+     return embeddings.squeeze(0)
+
+
+ def voice_conversion(source_audio, target_audio):
+     """Convert the source voice to sound like the target voice."""
+     # The gr.Audio components use type="filepath", so source_audio and
+     # target_audio arrive as paths to files Gradio has already written to
+     # disk. Copy them to private temp files before processing.
+     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as source_tmp, \
+          tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as target_tmp:
+         source_path = source_tmp.name
+         target_path = target_tmp.name
+
+     shutil.copy(source_audio, source_path)
+     shutil.copy(target_audio, target_path)
+
+     try:
+         # Load the source audio
+         source_signal, source_fs = torchaudio.load(source_path)
+
+         # Downmix stereo to mono
+         if source_signal.shape[0] > 1:
+             source_signal = torch.mean(source_signal, dim=0, keepdim=True)
+
+         # Resample source to 16 kHz if needed
+         if source_fs != 16000:
+             resampler = torchaudio.transforms.Resample(source_fs, 16000)
+             source_signal = resampler(source_signal)
+             source_fs = 16000
+
+         # Extract target speaker embedding
+         target_emb = extract_speaker_embedding(target_path)
+
+         # Generate converted waveform.
+         # NOTE: SpeechBrain's HIFIGAN is a vocoder whose documented interface
+         # is decode_batch(mel_spectrogram); it does not take a raw waveform
+         # plus a speaker embedding, so this call will fail as written and
+         # needs a real voice-conversion front end (see the note after this file).
+         waveform = hifi_gan.generate(source_signal, speaker_emb=target_emb)
+
+         # Save output
+         output_path = os.path.join(tempfile.mkdtemp(), "output.wav")
+         torchaudio.save(output_path, waveform.squeeze(0).cpu(), 16000)
+
+         return output_path
+     finally:
+         # Clean up temp copies
+         os.unlink(source_path)
+         os.unlink(target_path)
+
+
+ # Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# 🎙️ Voice Changer")
+     gr.Markdown("بارگذاری فایل صوتی اصلی و فایل صوتی هدف برای تبدیل صدای اول به سبک دوم")
+
+     with gr.Row():
+         with gr.Column():
+             source_audio = gr.Audio(label="فایل صوتی اصلی (صدا برای تبدیل)", type="filepath")
+         with gr.Column():
+             target_audio = gr.Audio(label="فایل صوتی هدف (سبک مورد نظر)", type="filepath")
+
+     output_audio = gr.Audio(label="خروجی تبدیل شده", interactive=False)
+
+     convert_btn = gr.Button("تبدیل صوت")
+     convert_btn.click(
+         fn=voice_conversion,
+         inputs=[source_audio, target_audio],
+         outputs=output_audio
+     )
+
+     gr.Examples(
+         examples=[
+             [os.path.join(os.path.dirname(__file__), "examples/source1.wav"),
+              os.path.join(os.path.dirname(__file__), "examples/target1.wav")],
+             [os.path.join(os.path.dirname(__file__), "examples/source2.wav"),
+              os.path.join(os.path.dirname(__file__), "examples/target2.wav")]
+         ],
+         inputs=[source_audio, target_audio],
+         outputs=output_audio,
+         fn=voice_conversion,
+         cache_examples=True
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
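Editor's note on the added version: SpeechBrain's HIFIGAN is a neural vocoder, and the interface its model card documents is decode_batch over mel spectrograms; a generate(waveform, speaker_emb=...) method is not part of that API, so the conversion step above is a placeholder rather than working voice conversion. The sketch below shows the documented call pattern, assuming the speechbrain/tts-tacotron2-ljspeech checkpoint to produce the mel spectrogram; it performs plain text-to-speech in the LJSpeech voice. A real voice-conversion path would need a dedicated VC model, or a TTS model conditioned on speaker embeddings such as the SpeechT5 pipeline sketched earlier.

import torchaudio
from speechbrain.pretrained import Tacotron2, HIFIGAN

# Documented usage of the LJSpeech HiFi-GAN: it vocodes mel spectrograms,
# here produced by Tacotron2 from text.
tacotron2 = Tacotron2.from_hparams(
    source="speechbrain/tts-tacotron2-ljspeech",
    savedir="pretrained_models/tts-tacotron2-ljspeech",
)
hifi_gan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech",
    savedir="pretrained_models/tts-hifigan-ljspeech",
)

mel_output, mel_length, alignment = tacotron2.encode_text("Hello, this is a vocoder test.")
waveforms = hifi_gan.decode_batch(mel_output)                    # [batch, 1, time]
torchaudio.save("tts_output.wav", waveforms.squeeze(1), 22050)   # LJSpeech models run at 22.05 kHz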