hashhac committed
Commit e724e7e · 1 Parent(s): 557f8a9

added template code

Files changed (2)
  1. app.py +49 -226
  2. requirements.txt +8 -5
app.py CHANGED
@@ -1,228 +1,51 @@
- import os
- import torch
- import numpy as np
- import librosa
- import gradio as gr
- import torchaudio
- import asyncio
- from gradio_webrtc import (
-     AsyncAudioVideoStreamHandler,
-     WebRTC,
-     get_twilio_turn_credentials,
  )
- from pathlib import Path
-
- # Create directories
- os.makedirs("voice_samples", exist_ok=True)
-
- # Voice presets (simple pitch and speed modifications)
- VOICE_PRESETS = {
-     "Deep Male": {"pitch_shift": -4, "speed_factor": 0.9},
-     "Standard Male": {"pitch_shift": -2, "speed_factor": 0.95},
-     "Standard Female": {"pitch_shift": 2, "speed_factor": 1.05},
-     "High Female": {"pitch_shift": 4, "speed_factor": 1.1},
- }
-
- # Audio processing function
- def process_audio(waveform, sampling_rate=16000):
-     # Convert from int16 to floating point if needed
-     if waveform.dtype == np.int16:
-         waveform = waveform / 32768.0
-
-     # Make sure input is mono
-     if len(waveform.shape) > 1:
-         waveform = librosa.to_mono(waveform.T)
-
-     # Resample to 16 kHz if needed
-     if sampling_rate != 16000:
-         waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=16000)
-
-     # Limit length to avoid memory issues
-     max_length = 16000 * 15
-     if len(waveform) > max_length:
-         waveform = waveform[:max_length]
-
-     return waveform
-
- # Simple voice conversion using torchaudio effects
- def convert_voice_simple(waveform, preset):
-     try:
-         # Convert to tensor
-         if not torch.is_tensor(waveform):
-             waveform_tensor = torch.tensor(waveform).float()
-         else:
-             waveform_tensor = waveform
-
-         # Ensure tensor is properly shaped
-         if waveform_tensor.dim() == 1:
-             waveform_tensor = waveform_tensor.unsqueeze(0)
-
-         # Apply pitch shift
-         pitch_shift = preset.get("pitch_shift", 0)
-         if pitch_shift != 0:
-             waveform_tensor = torchaudio.functional.pitch_shift(
-                 waveform_tensor,
-                 sample_rate=16000,
-                 n_steps=pitch_shift
-             )
-
-         # Apply speed change
-         speed_factor = preset.get("speed_factor", 1.0)
-         if speed_factor != 1.0:
-             waveform_tensor = torchaudio.functional.speed(
-                 waveform_tensor,
-                 speed_factor
-             )
-
-         # Add some effects for more natural sound
-         # Light reverb effect
-         waveform_tensor = torchaudio.functional.add_reverb(
-             waveform_tensor,
-             sample_rate=16000,
-             reverberance=20,
-             room_scale=50,
-             wet_gain=0
-         )
-
-         return waveform_tensor.squeeze().numpy()
-
-     except Exception as e:
-         print(f"Error in voice conversion: {e}")
-         return waveform
-
- class VoiceConversionHandler(AsyncAudioVideoStreamHandler):
-     def __init__(
-         self, expected_layout="mono", output_sample_rate=16000, output_frame_size=1024
-     ) -> None:
-         super().__init__(
-             expected_layout,
-             output_sample_rate,
-             output_frame_size,
-             input_sample_rate=16000,
-         )
-         self.audio_queue = asyncio.Queue()
-         self.quit = asyncio.Event()
-         self.voice_preset = None
-         self.buffer = np.array([])
-         self.buffer_size = 4096  # Buffer size for processing
-
-     def copy(self) -> "VoiceConversionHandler":
-         return VoiceConversionHandler(
-             expected_layout=self.expected_layout,
-             output_sample_rate=self.output_sample_rate,
-             output_frame_size=self.output_frame_size,
-         )
-
-     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
-         sample_rate, array = frame
-         array = array.squeeze()
-
-         # Add new audio to buffer
-         self.buffer = np.append(self.buffer, process_audio(array, sample_rate))
-
-         # Process when buffer is large enough
-         if len(self.buffer) >= self.buffer_size:
-             # Process audio chunk
-             if self.voice_preset:
-                 preset = VOICE_PRESETS.get(self.voice_preset, VOICE_PRESETS["Standard Male"])
-                 processed_audio = convert_voice_simple(self.buffer[:self.buffer_size], preset)
-                 result = (processed_audio * 32767).astype(np.int16)
-             else:
-                 # Return original if no voice preset is selected
-                 result = (self.buffer[:self.buffer_size] * 32767).astype(np.int16)
-
-             self.audio_queue.put_nowait((16000, result))
-             # Keep remainder
-             self.buffer = self.buffer[self.buffer_size:]
-
-     async def emit(self):
-         if not self.args_set.is_set():
-             await self.wait_for_args()
-
-         # Get selected voice preset
-         if self.latest_args and len(self.latest_args) > 1:
-             self.voice_preset = self.latest_args[1]
-
-         # If queue is empty, return silence
-         if self.audio_queue.empty():
-             return (16000, np.zeros(self.output_frame_size, dtype=np.int16))
-
-         return await self.audio_queue.get()
-
-     def shutdown(self) -> None:
-         self.quit.set()
-         self.args_set.clear()
-         self.quit.clear()
-
- # CSS for styling
- css = """
- .container {
-     max-width: 800px;
-     margin: 0 auto;
-     padding: 20px;
- }
- .header {
-     text-align: center;
-     margin-bottom: 20px;
- }
- .voice-controls {
-     padding: 15px;
-     border-radius: 8px;
-     background-color: #f5f5f5;
-     margin-bottom: 20px;
- }
- """
-
- # Main application
- def main():
-     with gr.Blocks(css=css) as demo:
-         gr.Markdown(
-             """
-             <div class="header">
-                 <h1>Real-time Voice Conversion</h1>
-                 <p>Speak into your microphone to convert your voice in real-time using audio effects.</p>
-             </div>
-             """
-         )

-         with gr.Row(equal_height=True):
-             with gr.Column():
-                 webrtc = WebRTC(
-                     label="Voice Chat",
-                     modality="audio",
-                     mode="send-receive",
-                     rtc_configuration=get_twilio_turn_credentials(),
-                     pulse_color="rgb(35, 157, 225)",
-                 )
-
-             with gr.Column(elem_classes="voice-controls"):
-                 voice_preset = gr.Radio(
-                     choices=list(VOICE_PRESETS.keys()),
-                     value="Standard Male",
-                     label="Target Voice"
-                 )
-
-                 gr.Markdown(
-                     """
-                     ### How to use:
-                     1. Allow microphone access
-                     2. Select your target voice style
-                     3. Click the microphone button and start speaking
-                     4. Your voice will be converted in real-time
-
-                     Note: This version uses basic audio effects without SentencePiece.
-                     """
-                 )
-
-         webrtc.stream(
-             VoiceConversionHandler(),
-             inputs=[webrtc, voice_preset],
-             outputs=[webrtc],
-             concurrency_limit=2,
-         )
-
-     return demo
-
- if __name__ == "__main__":
-     demo = main()
-     demo.launch()
 
+ from fastrtc import (
+     ReplyOnPause, AdditionalOutputs, Stream,
+     audio_to_bytes, aggregate_bytes_to_16bit
  )
+ import gradio as gr
+ from groq import Groq
+ import numpy as np
+ import anthropic
+ from elevenlabs import ElevenLabs
+
+ groq_client = Groq()
+ claude_client = anthropic.Anthropic()
+ tts_client = ElevenLabs()
+
+
+ # See "Talk to Claude" in Cookbook for an example of how to keep
+ # track of the chat history.
+ def response(
+     audio: tuple[int, np.ndarray],
+ ):
+     prompt = groq_client.audio.transcriptions.create(
+         file=("audio-file.mp3", audio_to_bytes(audio)),
+         model="whisper-large-v3-turbo",
+         response_format="verbose_json",
+     ).text
+     response = claude_client.messages.create(
+         model="claude-3-5-haiku-20241022",
+         max_tokens=512,
+         messages=[{"role": "user", "content": prompt}],
+     )
+     response_text = " ".join(
+         block.text
+         for block in response.content
+         if getattr(block, "type", None) == "text"
+     )
+     iterator = tts_client.text_to_speech.convert_as_stream(
+         text=response_text,
+         voice_id="JBFqnCBsd6RMkjVDRZzb",
+         model_id="eleven_multilingual_v2",
+         output_format="pcm_24000"
+     )
+     for chunk in aggregate_bytes_to_16bit(iterator):
+         audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
+         yield (24000, audio_array)
+
+ stream = Stream(
+     modality="audio",
+     mode="send-receive",
+     handler=ReplyOnPause(response),
+ )
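
Editor's note: the committed template builds `stream` but never launches it, and the comment above `response` defers conversation memory to the fastrtc cookbook. Two sketches follow; neither is part of the commit, and both assume they are appended to the new app.py. First, a minimal launcher, assuming fastrtc's `Stream` exposes its Gradio app as `stream.ui` and that the three clients find `GROQ_API_KEY`, `ANTHROPIC_API_KEY`, and `ELEVENLABS_API_KEY` in the environment (the documented SDK defaults):

# Launcher sketch, not part of this commit. Assumes fastrtc's Stream.ui
# Gradio front end and API keys already set in the environment.
if __name__ == "__main__":
    stream.ui.launch()

Second, one way to keep the chat history the comment mentions: accumulate turns in a module-level list and send the whole list to Claude on every call. This follows the spirit of the cookbook's "Talk to Claude" entry, not its exact code; `chat_history` and `response_with_history` are names invented here.

# History-tracking sketch, not part of this commit.
chat_history: list[dict] = []

def response_with_history(audio: tuple[int, np.ndarray]):
    prompt = groq_client.audio.transcriptions.create(
        file=("audio-file.mp3", audio_to_bytes(audio)),
        model="whisper-large-v3-turbo",
        response_format="verbose_json",
    ).text
    chat_history.append({"role": "user", "content": prompt})
    reply = claude_client.messages.create(
        model="claude-3-5-haiku-20241022",
        max_tokens=512,
        messages=chat_history,  # full conversation, not just the last turn
    )
    reply_text = " ".join(
        block.text for block in reply.content
        if getattr(block, "type", None) == "text"
    )
    chat_history.append({"role": "assistant", "content": reply_text})
    # TTS streaming is identical to response() above
    iterator = tts_client.text_to_speech.convert_as_stream(
        text=reply_text,
        voice_id="JBFqnCBsd6RMkjVDRZzb",
        model_id="eleven_multilingual_v2",
        output_format="pcm_24000",
    )
    for chunk in aggregate_bytes_to_16bit(iterator):
        yield (24000, np.frombuffer(chunk, dtype=np.int16).reshape(1, -1))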
requirements.txt CHANGED
@@ -3,11 +3,14 @@ uvicorn
  transformers
  torch
  numpy
- librosa
  python-dotenv
  fastrtc[vad, tts]
- SentencePiece
- gradio_webrtc
- twilio
  gradio
- torchaudio

  transformers
  torch
  numpy
+ # librosa
  python-dotenv
  fastrtc[vad, tts]
+ # SentencePiece
+ # twilio
  gradio
+ # torchaudio
+ elevenlabs
+ groq
+ anthropic
+ ffmpeg
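
Editor's note: python-dotenv survives the dependency cut while three key-reading SDKs arrive, so a natural companion to the new requirements is loading a local .env before the clients in app.py are constructed. A minimal sketch, assuming a .env file beside app.py and the SDKs' default variable names:

# Sketch, not part of this commit: run before Groq(), anthropic.Anthropic(),
# and ElevenLabs() are instantiated. Assumed .env contents use the documented
# SDK defaults:
#   GROQ_API_KEY=...
#   ANTHROPIC_API_KEY=...
#   ELEVENLABS_API_KEY=...
from dotenv import load_dotenv

load_dotenv()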