suprimedev committed
Commit ec585dc (verified)
1 Parent(s): a1242a1

Update app.py

Files changed (1)
  1. app.py +316 -80
app.py CHANGED
@@ -1,100 +1,336 @@
  import gradio as gr
  import torch
  import torchaudio
- from torchaudio.transforms import Resample
- import numpy as np
- import os
- from datetime import datetime
  import soundfile as sf

- # Check whether a GPU is available
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

- def load_audio(file_path, target_sr=16000):
-     """Load an audio file and convert it to a tensor."""
      try:
-         waveform, sample_rate = torchaudio.load(file_path)

-         # Resample if needed
-         if sample_rate != target_sr:
-             resampler = Resample(sample_rate, target_sr)
-             waveform = resampler(waveform)
-
-         return waveform.to(device), target_sr
-     except Exception as e:
-         raise gr.Error(f"Error loading the audio file: {str(e)}")

- def preprocess_audio(waveform, sr):
-     """Preprocess the audio signal."""
-     # Normalize the signal
-     waveform = waveform / torch.max(torch.abs(waveform))
-
-     # Convert to mono if the input is stereo
-     if waveform.shape[0] > 1:
-         waveform = torch.mean(waveform, dim=0, keepdim=True)

-     return waveform

- def clone_voice(source_audio, target_audio, output_format="wav"):
-     """Perform the voice-cloning operation."""
-     try:
-         # Load the audio files
-         source_waveform, source_sr = load_audio(source_audio)
-         target_waveform, target_sr = load_audio(target_audio)

-         # Preprocess
-         source_waveform = preprocess_audio(source_waveform, source_sr)
-         target_waveform = preprocess_audio(target_waveform, target_sr)

-         # The voice-conversion model should be applied here.
-         # This part is simplified and needs a real implementation;
-         # as a stand-in, we only trim the target waveform to the source length.
-         min_len = min(source_waveform.shape[1], target_waveform.shape[1])
-         converted_waveform = target_waveform[:, :min_len]

-         # Save the result file
-         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-         output_filename = f"output_{timestamp}.{output_format.lower()}"

-         # Save with soundfile, which handles multiple formats
-         sf.write(output_filename, converted_waveform.cpu().numpy().T, target_sr)

-         return output_filename
      except Exception as e:
-         raise gr.Error(f"Error while processing audio: {str(e)}")

- # Gradio interface
- with gr.Blocks(title="Voice Cloner") as demo:
-     gr.Markdown("# 🎤 Voice Cloner")
-     gr.Markdown("Upload a source audio file and a target audio file to clone the voice")
-
-     with gr.Row():
-         with gr.Column():
-             source_audio = gr.Audio(label="Source audio file (the voice to copy)", type="filepath")
-             target_audio = gr.Audio(label="Target audio file (the content to convert)", type="filepath")
-             output_format = gr.Dropdown(
-                 choices=["wav", "mp3"],
-                 value="wav",
-                 label="Output file format",
-                 interactive=True
-             )
-             submit_btn = gr.Button("Start voice conversion")
-
-         with gr.Column():
-             output_audio = gr.Audio(label="Result audio file", interactive=False)
-             download_link = gr.File(label="Download the result file")
-
-     submit_btn.click(
-         fn=clone_voice,
-         inputs=[source_audio, target_audio, output_format],
-         outputs=[output_audio]
-     )
-
-     output_audio.change(
-         lambda x: gr.File(value=x),
-         inputs=[output_audio],
-         outputs=[download_link]
-     )

  if __name__ == "__main__":
-     demo.launch()

  import gradio as gr
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from datasets import load_dataset
  import torch
  import torchaudio
  import soundfile as sf
+ import os
+
+ # Load the SpeechT5 TTS models
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+ model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+ # Load a default speaker embedding as a fallback.
+ # SpeechT5 conditions on a pre-computed x-vector and cannot derive one from
+ # arbitrary audio by itself; the target voice is therefore extracted later with
+ # a dedicated speaker-embedding model (e.g. an ECAPA-TDNN / x-vector encoder).
+ # The dataset entry below is only a generic placeholder speaker.
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
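+ # The CMU ARCTIC x-vectors are 512-dimensional, matching SpeechT5's
+ # config.speaker_embedding_dim, so this placeholder tensor has shape (1, 512).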
+
+
+ def voice_clone(text_audio_path, voice_audio_path):
+     """
+     Speak the content of text_audio_path in the voice of voice_audio_path.
+
+     The first file supplies the *text* (obtained via ASR transcription); the
+     second file supplies the *speaker identity* (obtained via a speaker-embedding
+     model). This matches the request "imitate the voice of the first file using
+     the second file": file 1 provides the content, file 2 provides the voice.
+     """
      try:
+         # Pipeline overview:
+         #   1. ASR (Whisper) transcribes text_audio_path to obtain the text to speak.
+         #   2. A speaker-embedding model extracts an x-vector from voice_audio_path.
+         #      SpeechT5 itself cannot extract speaker embeddings from arbitrary
+         #      audio, so a separate model (SpeechBrain here; pyannote.audio would
+         #      also work) handles this step.
+         #   3. SpeechT5 + HiFi-GAN synthesize the transcribed text conditioned on
+         #      that speaker embedding.
+         # Extra dependencies beyond the original app: an up-to-date transformers
+         # (for Whisper) and speechbrain.
+
+         from transformers import pipeline
+
+         # Initialize the ASR pipeline (reloaded on every call; cache it at module
+         # level for a production app)
+         asr_pipeline = pipeline(
+             "automatic-speech-recognition",
+             model="openai/whisper-large-v2",
+             device=0 if torch.cuda.is_available() else -1,
+         )
+
+         # Transcribe the content from the first audio file
+         print(f"Transcribing {text_audio_path}...")
+         transcription_result = asr_pipeline(text_audio_path)
+         target_text = transcription_result["text"]
+         print(f"Transcribed text: {target_text}")
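+         # Note: for clips longer than about 30 seconds, consider passing
+         # chunk_length_s=30 to the pipeline so Whisper transcribes in chunks.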
+
+         if not target_text:
+             return None, "No discernible text was extracted from the first audio file. Please try a clearer recording."
+
+         # Extract the speaker embedding from the second audio file.
+         # SpeechT5 expects a pre-computed x-vector and ships no extractor for
+         # arbitrary audio, so a dedicated speaker-embedding model is used instead
+         # of the generic placeholder loaded above. SpeechBrain's x-vector encoder
+         # is used because its 512-dimensional output matches what SpeechT5 expects.
+         # Requires: pip install speechbrain
+         from speechbrain.pretrained import EncoderClassifier  # speechbrain.inference in SpeechBrain >= 1.0
+
+         # Initialize the speaker-embedding model
+         speaker_embedding_model_sb = EncoderClassifier.from_hparams(
+             source="speechbrain/spkrec-xvect-voxceleb",
+             savedir="pretrained_models/spkrec-xvect-voxceleb",
+             run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"}
+         )
+
+         print(f"Extracting speaker embedding from {voice_audio_path}...")
+         # Load the second audio file; the encoder expects a mono 16 kHz waveform tensor
+         voice_waveform, sr = torchaudio.load(voice_audio_path)
+
+         if voice_waveform.shape[0] > 1:  # Convert to mono
+             voice_waveform = voice_waveform.mean(dim=0, keepdim=True)
+
+         if sr != 16000:
+             resampler = torchaudio.transforms.Resample(sr, 16000)
+             voice_waveform = resampler(voice_waveform)
+
+         # Extract the speaker embedding; encode_batch returns shape (batch, 1, 512)
+         embeddings = speaker_embedding_model_sb.encode_batch(voice_waveform)
+         embeddings = torch.nn.functional.normalize(embeddings, dim=2)
+         # SpeechT5 expects speaker embeddings of shape (1, 512) for a single speaker
+         speaker_embeddings = embeddings.squeeze(1)
+         print("Speaker embedding extracted.")
+
+         # Synthesize speech with SpeechT5
+         inputs = processor(text=target_text, return_tensors="pt")
+
+         # Move the inputs, models, and speaker embedding to the same device
+         if torch.cuda.is_available():
+             inputs = {k: v.to("cuda") for k, v in inputs.items()}
+             model.to("cuda")
+             vocoder.to("cuda")
+             speaker_embeddings = speaker_embeddings.to("cuda")
+
+         print("Generating speech...")
+
+         # Generate speech conditioned on the speaker embedding extracted above
+         speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+
+         # Normalize to be within [-1, 1] for audio playback
+         speech = speech.cpu().numpy()
+         max_val = max(abs(speech.min()), abs(speech.max()))
+         if max_val > 1.0:
+             speech = speech / max_val
+
+         print("Speech generated. Saving output file...")
+
+         # Save the generated audio to an output file
+         output_path = "cloned_voice_output.wav"
+         sf.write(output_path, speech, vocoder.config.sampling_rate)
+
+         return output_path, "Voice cloning successful!"
+
      except Exception as e:
+         error_message = f"An error occurred: {e}"
+         print(error_message)
+         import traceback
+         traceback.print_exc()
+         return None, error_message
+
+ # Gradio interface
+ # (gr.Audio's deprecated source= argument is omitted for compatibility with
+ # Gradio 4.x; file upload remains available by default.)
+ iface = gr.Interface(
+     fn=voice_clone,
+     inputs=[
+         gr.Audio(type="filepath", label="First audio file (provides the text)"),
+         gr.Audio(type="filepath", label="Second audio file (provides the voice to imitate)")
+     ],
+     outputs=[
+         gr.Audio(label="Cloned voice"),
+         gr.Textbox(label="Status")
+     ],
+     title="Voice Cloner with HuggingFace",
+     description="Upload the first audio file to have its text extracted and the second audio file to have its voice imitated; the text of the first file is then synthesized in the voice of the second file. (mp3/wav supported)",
+     examples=[
+         [
+             "audio_examples/example_content.wav",  # Example for content (what to say)
+             "audio_examples/example_voice.wav"     # Example for voice (how to say it)
+         ]
+     ]
+ )

  if __name__ == "__main__":
+     # Create an example directory and dummy files if they don't exist
+     # (numpy is imported before both branches so either one can use it)
+     import numpy as np
+
+     os.makedirs("audio_examples", exist_ok=True)
+     if not os.path.exists("audio_examples/example_content.wav"):
+         # Create a dummy WAV file for the content example
+         samplerate = 16000
+         duration = 2.0  # seconds
+         frequency = 440  # Hz
+         t = np.linspace(0., duration, int(samplerate * duration), endpoint=False)
+         sine_wave = 0.5 * np.sin(2 * np.pi * frequency * t)
+         sf.write("audio_examples/example_content.wav", sine_wave.astype(np.float32), samplerate)
+         print("Created dummy audio_examples/example_content.wav")
+
+     if not os.path.exists("audio_examples/example_voice.wav"):
+         # Create another dummy WAV file for the voice example
+         samplerate = 16000
+         duration = 1.5  # seconds
+         frequency = 880  # Hz
+         t = np.linspace(0., duration, int(samplerate * duration), endpoint=False)
+         sine_wave = 0.5 * np.sin(2 * np.pi * frequency * t)
+         sf.write("audio_examples/example_voice.wav", sine_wave.astype(np.float32), samplerate)
+         print("Created dummy audio_examples/example_voice.wav")
+
+     iface.launch()
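
A quick way to sanity-check the updated app.py locally, without launching the Gradio UI, is to call voice_clone directly. This is a minimal sketch under stated assumptions: the models download successfully, and the dummy example files created in the __main__ block already exist (the sine-wave dummies contain no speech, so this mainly exercises the transcription and error path; real recordings are needed for an actual clone). The smoke_test.py name is hypothetical and not part of this commit.

    # smoke_test.py (hypothetical helper, not part of the commit)
    from app import voice_clone  # importing app.py loads the SpeechT5 models

    output_path, status = voice_clone(
        "audio_examples/example_content.wav",   # file 1: provides the text
        "audio_examples/example_voice.wav",     # file 2: provides the voice
    )
    print(status, output_path)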