hashhac committed
Commit a958ea7 · 1 Parent(s): 4f31105
app.py ADDED
@@ -0,0 +1,228 @@
+ import os
+ import torch
+ import numpy as np
+ import librosa
+ import gradio as gr
+ import torchaudio
+ import asyncio
+ from gradio_webrtc import (
+     AsyncAudioVideoStreamHandler,
+     WebRTC,
+     get_twilio_turn_credentials,
+ )
+ from pathlib import Path
+
+ # Create directories
+ os.makedirs("voice_samples", exist_ok=True)
+
+ # Voice presets (simple pitch and speed modifications)
+ VOICE_PRESETS = {
+     "Deep Male": {"pitch_shift": -4, "speed_factor": 0.9},
+     "Standard Male": {"pitch_shift": -2, "speed_factor": 0.95},
+     "Standard Female": {"pitch_shift": 2, "speed_factor": 1.05},
+     "High Female": {"pitch_shift": 4, "speed_factor": 1.1},
+ }
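+ # pitch_shift is in semitones (used below as n_steps for torchaudio's
+ # pitch_shift); speed_factor is a playback-rate multiplier (1.0 = unchanged).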
+
+ # Audio processing function
+ def process_audio(waveform, sampling_rate=16000):
+     # Convert from int16 to floating point if needed
+     if waveform.dtype == np.int16:
+         waveform = waveform / 32768.0
+
+     # Make sure input is mono
+     if len(waveform.shape) > 1:
+         waveform = librosa.to_mono(waveform.T)
+
+     # Resample to 16 kHz if needed
+     if sampling_rate != 16000:
+         waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=16000)
+
+     # Limit length to avoid memory issues
+     max_length = 16000 * 15
+     if len(waveform) > max_length:
+         waveform = waveform[:max_length]
+
+     return waveform
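+ # Example: process_audio(chunk, 48000) yields a mono float array resampled
+ # to 16 kHz and truncated to at most 15 seconds.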
+
+ # Simple voice conversion using torchaudio effects
+ def convert_voice_simple(waveform, preset):
+     try:
+         # Convert to tensor
+         if not torch.is_tensor(waveform):
+             waveform_tensor = torch.tensor(waveform).float()
+         else:
+             waveform_tensor = waveform
+
+         # Ensure tensor is properly shaped (channels first)
+         if waveform_tensor.dim() == 1:
+             waveform_tensor = waveform_tensor.unsqueeze(0)
+
+         # Apply pitch shift
+         pitch_shift = preset.get("pitch_shift", 0)
+         if pitch_shift != 0:
+             waveform_tensor = torchaudio.functional.pitch_shift(
+                 waveform_tensor,
+                 sample_rate=16000,
+                 n_steps=pitch_shift,
+             )
+
+         # Apply speed change. torchaudio.functional.speed needs the source
+         # sample rate and returns a (waveform, lengths) tuple.
+         speed_factor = preset.get("speed_factor", 1.0)
+         if speed_factor != 1.0:
+             waveform_tensor, _ = torchaudio.functional.speed(
+                 waveform_tensor,
+                 orig_freq=16000,
+                 factor=speed_factor,
+             )
+
+         # Light reverb for a more natural sound. torchaudio has no
+         # functional.add_reverb; the sox "reverb" effect is the closest
+         # equivalent (args: reverberance, HF-damping, room-scale).
+         waveform_tensor, _ = torchaudio.sox_effects.apply_effects_tensor(
+             waveform_tensor,
+             sample_rate=16000,
+             effects=[["reverb", "20", "50", "50"]],
+         )
+         if waveform_tensor.shape[0] > 1:
+             # sox reverb can widen mono to stereo; fold back to mono
+             waveform_tensor = waveform_tensor.mean(dim=0, keepdim=True)
+
+         return waveform_tensor.squeeze().numpy()
+
+     except Exception as e:
+         print(f"Error in voice conversion: {e}")
+         return waveform
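+ # Example: convert_voice_simple(audio, VOICE_PRESETS["Deep Male"]) returns a
+ # float numpy array at 16 kHz with the preset's pitch and speed applied.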
+
+ class VoiceConversionHandler(AsyncAudioVideoStreamHandler):
+     def __init__(
+         self, expected_layout="mono", output_sample_rate=16000, output_frame_size=1024
+     ) -> None:
+         super().__init__(
+             expected_layout,
+             output_sample_rate,
+             output_frame_size,
+             input_sample_rate=16000,
+         )
+         self.audio_queue = asyncio.Queue()
+         self.quit = asyncio.Event()
+         self.voice_preset = None
+         self.buffer = np.array([])
+         self.buffer_size = 4096  # Buffer size for processing
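+         # 4096 samples at 16 kHz is ~256 ms per chunk: smaller buffers cut
+         # latency, larger ones give the pitch/speed effects more context.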
+
+     def copy(self) -> "VoiceConversionHandler":
+         return VoiceConversionHandler(
+             expected_layout=self.expected_layout,
+             output_sample_rate=self.output_sample_rate,
+             output_frame_size=self.output_frame_size,
+         )
+
+     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
+         sample_rate, array = frame
+         array = array.squeeze()
+
+         # Add new audio to buffer
+         self.buffer = np.append(self.buffer, process_audio(array, sample_rate))
+
+         # Process when buffer is large enough
+         if len(self.buffer) >= self.buffer_size:
+             # Process audio chunk
+             if self.voice_preset:
+                 preset = VOICE_PRESETS.get(self.voice_preset, VOICE_PRESETS["Standard Male"])
+                 processed_audio = convert_voice_simple(self.buffer[:self.buffer_size], preset)
+                 result = (processed_audio * 32767).astype(np.int16)
+             else:
+                 # Return original if no voice preset is selected
+                 result = (self.buffer[:self.buffer_size] * 32767).astype(np.int16)
+
+             self.audio_queue.put_nowait((16000, result))
+             # Keep remainder
+             self.buffer = self.buffer[self.buffer_size:]
+
+     async def emit(self):
+         if not self.args_set.is_set():
+             await self.wait_for_args()
+
+         # Get selected voice preset
+         if self.latest_args and len(self.latest_args) > 1:
+             self.voice_preset = self.latest_args[1]
+
+         # If queue is empty, return silence
+         if self.audio_queue.empty():
+             return (16000, np.zeros(self.output_frame_size, dtype=np.int16))
+
+         return await self.audio_queue.get()
+
+     def shutdown(self) -> None:
+         self.quit.set()
+         self.args_set.clear()
+         self.quit.clear()
+
+ # CSS for styling
+ css = """
+ .container {
+     max-width: 800px;
+     margin: 0 auto;
+     padding: 20px;
+ }
+ .header {
+     text-align: center;
+     margin-bottom: 20px;
+ }
+ .voice-controls {
+     padding: 15px;
+     border-radius: 8px;
+     background-color: #f5f5f5;
+     margin-bottom: 20px;
+ }
+ """
+
+ # Main application
+ def main():
+     with gr.Blocks(css=css) as demo:
+         gr.Markdown(
+             """
+             <div class="header">
+                 <h1>Real-time Voice Conversion</h1>
+                 <p>Speak into your microphone to convert your voice in real-time using audio effects.</p>
+             </div>
+             """
+         )
+
+         with gr.Row(equal_height=True):
+             with gr.Column():
+                 webrtc = WebRTC(
+                     label="Voice Chat",
+                     modality="audio",
+                     mode="send-receive",
+                     rtc_configuration=get_twilio_turn_credentials(),
+                     pulse_color="rgb(35, 157, 225)",
+                 )
+
+             with gr.Column(elem_classes="voice-controls"):
+                 voice_preset = gr.Radio(
+                     choices=list(VOICE_PRESETS.keys()),
+                     value="Standard Male",
+                     label="Target Voice",
+                 )
+
+                 gr.Markdown(
+                     """
+                     ### How to use:
+                     1. Allow microphone access
+                     2. Select your target voice style
+                     3. Click the microphone button and start speaking
+                     4. Your voice will be converted in real-time
+
+                     Note: This version uses basic audio effects without SentencePiece.
+                     """
+                 )
+
+         webrtc.stream(
+             VoiceConversionHandler(),
+             inputs=[webrtc, voice_preset],
+             outputs=[webrtc],
+             concurrency_limit=2,
+         )
+
+     return demo
+
+ if __name__ == "__main__":
+     demo = main()
+     demo.launch()
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ fastapi
+ uvicorn
+ transformers
+ torch
+ torchaudio  # imported by app.py
+ numpy
+ librosa
+ python-dotenv
+ fastrtc[vad, tts]
+ SentencePiece
+ gradio_webrtc
+ twilio
+ gradio
spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf67b36c47edfb1851466a1dff081b436bc6809b5ebc12811d9df0c0d0f28d0e
+ size 2176
spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f71ffadda3f3a4de079740a0b34963824dc644d9d5442283bd0a2b0d4f44ff0b
+ size 2176