Athspi committed on
Commit 170f3d8 · verified · 1 Parent(s): 5c157a1

Update app.py

Files changed (1)
  1. app.py +195 -36
app.py CHANGED
@@ -4,23 +4,62 @@ import numpy as np
 import gradio as gr
 import librosa
 import soundfile as sf
+import torch.nn as nn
+import torch.nn.functional as F
 from scipy.io import wavfile
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 from scipy import signal
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+from pedalboard import Pedalboard, Compressor, LadderFilter, Phaser, Reverb
+from pedalboard.io import AudioFile

 # Set device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")

-class VoiceConverter:
+class VoiceEncoder(nn.Module):
+    """Voice embedding network to extract speaker identity features"""
+    def __init__(self, input_dim=512, hidden_dim=256, embedding_dim=128):
+        super(VoiceEncoder, self).__init__()
+        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
+        self.linear = nn.Linear(hidden_dim*2, embedding_dim)
+
+    def forward(self, x):
+        self.lstm.flatten_parameters()
+        x, _ = self.lstm(x)
+        x = self.linear(x[:, -1, :])  # Take last timestep
+        x = F.normalize(x, p=2, dim=1)  # L2 normalization
+        return x
+
+class AdvancedVoiceConverter:
     def __init__(self):
         # Load wav2vec model for audio feature extraction
         self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
         self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)

-        # Parameters for voice conversion
-        self.female_pitch_shift = 2.0  # Shift pitch up for female voice
-        self.female_formant_shift = 1.2  # Adjust formants for female voice
+        # Initialize voice encoder (placeholder - not fully implemented)
+        self.voice_encoder = VoiceEncoder().to(device)
+
+        # Voice modification parameters
+        self.female_pitch_factors = {
+            'low': 1.5,     # Slight pitch increase
+            'medium': 2.5,  # Moderate pitch increase
+            'high': 3.5     # Significant pitch increase
+        }
+
+        self.female_formant_factors = {
+            'low': 1.15,
+            'medium': 1.25,
+            'high': 1.35
+        }
+
+        # Load target female voice statistics (these would normally be learned from data)
+        self.female_stats = {
+            'pitch_mean': 220.0,      # Hz (typical female fundamental frequency)
+            'pitch_std': 30.0,
+            'formant1_mean': 850.0,   # Hz (typical female first formant)
+            'formant2_mean': 2200.0,  # Hz (typical female second formant)
+            'formant3_mean': 3000.0,  # Hz (typical female third formant)
+        }

     def extract_features(self, audio, sample_rate):
         """Extract audio features using wav2vec2"""
@@ -38,8 +77,66 @@
         hidden_states = outputs.hidden_states[-1].squeeze().cpu().numpy()
         return hidden_states

-    def convert_to_female(self, audio_path, output_path):
-        """Convert voice from male to female"""
+    def extract_pitch_envelope(self, audio, sample_rate):
+        """Extract pitch and envelope information"""
+        # Extract pitch using librosa
+        pitches, magnitudes = librosa.piptrack(y=audio, sr=sample_rate)
+        pitch_envelope = []
+
+        # Find the most dominant pitch at each frame
+        for i in range(pitches.shape[1]):
+            index = magnitudes[:, i].argmax()
+            pitch = pitches[index, i]
+            pitch_envelope.append(pitch if pitch > 0 else 0)
+
+        return np.array(pitch_envelope)
+
+    def extract_formants(self, audio, sample_rate, n_formants=3):
+        """Extract formant frequencies using Linear Prediction Coefficients"""
+        # Pre-emphasis to amplify higher frequencies
+        audio_pre = librosa.effects.preemphasis(audio)
+
+        # Get LPC coefficients
+        order = 2 + sample_rate // 1000  # Rule of thumb for LPC order
+        lpc = librosa.lpc(audio_pre, order=order)
+
+        # Get roots of the LPC polynomial
+        roots = np.polynomial.polynomial.polyroots(lpc)
+
+        # Keep only roots with positive imaginary part
+        roots = roots[np.imag(roots) > 0]
+
+        # Convert to frequencies
+        angles = np.arctan2(np.imag(roots), np.real(roots))
+        formants = angles * (sample_rate / (2 * np.pi))
+
+        # Sort and return the first n_formants
+        formants = sorted(formants)[:n_formants]
+        return np.array(formants)
+
+    def apply_voice_effects(self, audio, sample_rate):
+        """Apply audio effects to enhance the feminine quality of the voice"""
+        # Create a pedalboard with effects
+        board = Pedalboard([
+            # Subtle compression to even out dynamics
+            Compressor(threshold_db=-20, ratio=2.5, attack_ms=5, release_ms=100),
+
+            # Phaser for a slightly breathier quality
+            Phaser(rate_hz=0.7, depth=0.3, feedback=0.2, mix=0.15),
+
+            # Filter to enhance higher frequencies
+            LadderFilter(mode=LadderFilter.Mode.HPF12, cutoff_hz=300, resonance=0.1),
+
+            # Add a subtle reverb for smoothness
+            Reverb(room_size=0.12, damping=0.6, wet_level=0.08, dry_level=0.92)
+        ])
+
+        # Apply effects
+        effected_audio = board(audio, sample_rate)
+        return effected_audio
+
+    def convert_to_female(self, audio_path, output_path, intensity='medium'):
+        """Convert voice from male to female with adjustable intensity"""
         try:
             # Load audio file
             audio, sample_rate = librosa.load(audio_path, sr=None)
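
The LPC formant estimate added above can be sanity-checked in isolation on a synthetic two-resonance "vowel", so no audio file is needed. A sketch: `np.roots` is used here instead of the diff's `polyroots`; for a real-coefficient LPC polynomial the extracted angles, and hence the frequencies, come out the same. The recovered peaks are approximate, and the high model order produces some spurious low-frequency poles.

```python
import numpy as np
import librosa

# Synthetic "vowel": two damped resonances at 700 Hz and 1200 Hz.
sr = 16000
t = np.arange(sr) / sr
audio = (np.sin(2 * np.pi * 700 * t) + 0.5 * np.sin(2 * np.pi * 1200 * t)) * np.exp(-3 * t)

a = librosa.lpc(librosa.effects.preemphasis(audio), order=2 + sr // 1000)
roots = np.roots(a)
roots = roots[np.imag(roots) > 0]  # one root per conjugate pair
freqs = np.sort(np.arctan2(np.imag(roots), np.real(roots)) * sr / (2 * np.pi))
print(freqs[:3])  # lowest resonance estimates in Hz (approximate)
```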
@@ -48,46 +145,88 @@
             if len(audio.shape) > 1:
                 audio = librosa.to_mono(audio)

-            # Extract pitch using librosa
-            pitches, magnitudes = librosa.piptrack(y=audio, sr=sample_rate)
+            # Get pitch and formant shift factors based on intensity
+            pitch_factor = self.female_pitch_factors[intensity]
+            formant_factor = self.female_formant_factors[intensity]
+
+            # Extract pitch contour and formants
+            pitch_envelope = self.extract_pitch_envelope(audio, sample_rate)
+            formants = self.extract_formants(audio, sample_rate)
+
+            print(f"Original average pitch: {np.mean(pitch_envelope[pitch_envelope > 0])} Hz")
+            print(f"Original formants: {formants} Hz")

-            # Apply pitch shifting for female voice
-            audio_female = librosa.effects.pitch_shift(
+            # Apply time-frequency domain transformation (WORLD or PSOLA would be better,
+            # but using a simpler approach for demonstration)
+
+            # 1. Apply pitch shifting
+            audio_pitched = librosa.effects.pitch_shift(
                 audio,
                 sr=sample_rate,
-                n_steps=self.female_pitch_shift
+                n_steps=pitch_factor
             )

-            # Apply formant shifting using a simple method - resample and scale back
-            y_stretched = librosa.effects.time_stretch(audio_female, rate=self.female_formant_shift)
-            audio_female_formant = librosa.resample(
-                y_stretched,
-                orig_sr=sample_rate,
-                target_sr=int(sample_rate * self.female_formant_shift)
-            )
-            audio_female_formant = librosa.resample(
-                audio_female_formant,
-                orig_sr=int(sample_rate * self.female_formant_shift),
-                target_sr=sample_rate
-            )
+            # 2. Apply formant shifting using a more sophisticated approach
+            # First, split audio into harmonic and percussive components
+            harmonic, percussive = librosa.effects.hpss(audio_pitched)
+
+            # Apply formant transformation to harmonic component
+            n_fft = 2048
+            hop_length = 512
+
+            # Get spectrogram
+            D = librosa.stft(harmonic, n_fft=n_fft, hop_length=hop_length)
+
+            # Compress/stretch frequency axis to shift formants
+            freq_bins = D.shape[0]
+
+            # Create a warping matrix for formant shifting
+            warp_matrix = np.zeros((freq_bins, freq_bins))
+            for i in range(freq_bins):
+                target_bin = int(i / formant_factor)
+                if target_bin < freq_bins:
+                    warp_matrix[i, target_bin] = 1

-            # Match the length with the original
-            min_len = min(len(audio), len(audio_female_formant))
-            audio_female_formant = audio_female_formant[:min_len]
+            # Apply the frequency warping
+            D_warped = np.dot(warp_matrix, np.abs(D)) * np.exp(1j * np.angle(D))
+
+            # Convert back to time domain
+            harmonic_formant_shifted = librosa.istft(D_warped, hop_length=hop_length)
+
+            # Ensure both components have the same length
+            min_len = min(len(harmonic_formant_shifted), len(percussive))
+            harmonic_formant_shifted = harmonic_formant_shifted[:min_len]
+            percussive = percussive[:min_len]
+
+            # Recombine harmonic and percussive parts
+            audio_transformed = harmonic_formant_shifted + 0.8 * percussive
+
+            # Apply audio effects to enhance feminine qualities
+            audio_enhanced = self.apply_voice_effects(audio_transformed, sample_rate)

             # Save the result
-            sf.write(output_path, audio_female_formant, sample_rate)
+            sf.write(output_path, audio_enhanced, sample_rate)
+
+            # Extract post-conversion stats for logging
+            pitch_envelope_after = self.extract_pitch_envelope(audio_enhanced, sample_rate)
+            formants_after = self.extract_formants(audio_enhanced, sample_rate)
+
+            print(f"Converted average pitch: {np.mean(pitch_envelope_after[pitch_envelope_after > 0])} Hz")
+            print(f"Converted formants: {formants_after} Hz")
+
             return output_path

         except Exception as e:
             print(f"Error during conversion: {e}")
+            import traceback
+            traceback.print_exc()
             return None

 # Initialize voice converter
-voice_converter = VoiceConverter()
+voice_converter = AdvancedVoiceConverter()

 # Create Gradio interface
-def convert_voice(audio_file):
+def convert_voice(audio_file, intensity):
     """Function to handle the Gradio interface"""
     # Create a temporary file path for the output
     input_filename = os.path.basename(audio_file)
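
The formant warp in the hunk above builds a `freq_bins` x `freq_bins` matrix, which costs O(F²) memory and truncates to the nearest bin. A sketch of an equivalent per-frame version using linear interpolation, assuming the same "output bin i reads input bin i / factor" convention:

```python
import numpy as np

def warp_spectrogram(D, factor):
    """Stretch each frame's magnitude spectrum by `factor` (>1 raises formants)."""
    bins = np.arange(D.shape[0])
    mag, phase = np.abs(D), np.angle(D)
    warped = np.empty_like(mag)
    for f in range(D.shape[1]):
        # output bin i takes the input magnitude at bin i / factor
        warped[:, f] = np.interp(bins / factor, bins, mag[:, f])
    return warped * np.exp(1j * phase)

D = np.fft.rfft(np.random.randn(2048))[:, None]  # one dummy STFT frame
print(warp_spectrogram(D, 1.25).shape)           # (1025, 1)
```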
@@ -95,7 +234,7 @@ def convert_voice(audio_file):
     output_path = os.path.join(os.path.dirname(audio_file), output_filename)

     # Perform voice conversion
-    result = voice_converter.convert_to_female(audio_file, output_path)
+    result = voice_converter.convert_to_female(audio_file, output_path, intensity)

     if result:
         return result
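
With the new `intensity` argument threaded through, the converter can also be driven directly for batch work, bypassing Gradio. A hypothetical sketch: it assumes app.py is importable as `app` (i.e., the `demo.launch()` call at the bottom is guarded by `if __name__ == "__main__":`), the file names are placeholders, and importing still loads the wav2vec2 model.

```python
# Hypothetical batch driver; file names are placeholders.
from app import AdvancedVoiceConverter

converter = AdvancedVoiceConverter()
for name in ["take1.wav", "take2.wav"]:
    result = converter.convert_to_female(name, f"female_{name}", intensity="high")
    print(name, "->", result)
```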
@@ -105,12 +244,32 @@
 # Define the Gradio interface
 demo = gr.Interface(
     fn=convert_voice,
-    inputs=gr.Audio(type="filepath", label="Upload Male Voice Audio"),
+    inputs=[
+        gr.Audio(type="filepath", label="Upload Voice Audio"),
+        gr.Radio(
+            ["low", "medium", "high"],
+            label="Feminization Intensity",
+            value="medium",
+            info="Choose how much to feminize the voice"
+        )
+    ],
     outputs=gr.Audio(label="Converted Female Voice"),
-    title="Voice Gender Conversion (Male to Female)",
-    description="Upload an audio file with a male voice to convert it to a female voice using AI.",
-    examples=[["sample1.wav"], ["sample2.wav"]],
-    theme=gr.themes.Soft()
+    title="Advanced Voice Gender Conversion",
+    description="Upload an audio file to convert it to a female voice using advanced AI techniques.",
+    examples=[["sample1.wav", "medium"], ["sample2.wav", "high"]],
+    theme=gr.themes.Soft(),
+    article="""
+    ## How This Works
+
+    This application uses several advanced techniques to convert voices to sound more feminine:
+
+    1. **Pitch Shifting**: Raises the fundamental frequency to typical female ranges
+    2. **Formant Modification**: Adjusts the vocal tract resonances that differentiate male and female voices
+    3. **Voice Effects**: Applies subtle audio effects to enhance feminine voice qualities
+    4. **Spectral Transformation**: Uses frequency warping to achieve more natural-sounding results
+
+    The 'Feminization Intensity' lets you control how dramatic the transformation should be.
+    """
 )

 # Launch the app
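
The effects chain that `apply_voice_effects` adds (article item 3 above) can be checked in isolation on a synthesized tone. A sketch with parameter values mirroring the diff; the `Pedalboard` object is called directly on a float32 array plus sample rate:

```python
import numpy as np
from pedalboard import Pedalboard, Compressor, LadderFilter, Phaser, Reverb

sr = 44100
t = np.arange(sr) / sr
tone = (0.3 * np.sin(2 * np.pi * 220 * t)).astype(np.float32)  # 1 s of 220 Hz

board = Pedalboard([
    Compressor(threshold_db=-20, ratio=2.5, attack_ms=5, release_ms=100),
    Phaser(rate_hz=0.7, depth=0.3, feedback=0.2, mix=0.15),
    LadderFilter(mode=LadderFilter.Mode.HPF12, cutoff_hz=300, resonance=0.1),
    Reverb(room_size=0.12, damping=0.6, wet_level=0.08, dry_level=0.92),
])
processed = board(tone, sr)
print(processed.shape, processed.dtype)
```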