Update app.py
app.py
CHANGED
Removed lines (the previous VoiceConverter implementation; lines cut off in the diff view end with "…"):

-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-class VoiceConverter:
-    def …
-        """…
-        # Apply …
-            n_steps=…
-        # Apply formant shifting using a …
-        sf.write(output_path, …
-voice_converter = …
-def convert_voice(audio_file):
-    result = voice_converter.convert_to_female(audio_file, output_path)
-    inputs=…
-    title="Voice Gender Conversion…
-    description="Upload an audio file…
-    examples=[["sample1.wav"], ["sample2.wav"]],
-    theme=gr.themes.Soft()
@@ -4,23 +4,62 @@ import numpy as np
 import gradio as gr
 import librosa
 import soundfile as sf
+import torch.nn as nn
+import torch.nn.functional as F
 from scipy.io import wavfile
 from scipy import signal
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+from pedalboard import Pedalboard, Compressor, LadderFilter, Phaser, Reverb
+from pedalboard.io import AudioFile

 # Set device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")

+class VoiceEncoder(nn.Module):
+    """Voice embedding network to extract speaker identity features"""
+    def __init__(self, input_dim=512, hidden_dim=256, embedding_dim=128):
+        super(VoiceEncoder, self).__init__()
+        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
+        self.linear = nn.Linear(hidden_dim*2, embedding_dim)
+
+    def forward(self, x):
+        self.lstm.flatten_parameters()
+        x, _ = self.lstm(x)
+        x = self.linear(x[:, -1, :])  # Take last timestep
+        x = F.normalize(x, p=2, dim=1)  # L2 normalization
+        return x
+
+class AdvancedVoiceConverter:
     def __init__(self):
         # Load wav2vec model for audio feature extraction
         self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
         self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)

+        # Initialize voice encoder (placeholder - not fully implemented)
+        self.voice_encoder = VoiceEncoder().to(device)
+
+        # Voice modification parameters
+        self.female_pitch_factors = {
+            'low': 1.5,     # Slight pitch increase
+            'medium': 2.5,  # Moderate pitch increase
+            'high': 3.5     # Significant pitch increase
+        }
+
+        self.female_formant_factors = {
+            'low': 1.15,
+            'medium': 1.25,
+            'high': 1.35
+        }
+
+        # Load target female voice statistics (these would normally be learned from data)
+        self.female_stats = {
+            'pitch_mean': 220.0,      # Hz (typical female fundamental frequency)
+            'pitch_std': 30.0,
+            'formant1_mean': 850.0,   # Hz (typical female first formant)
+            'formant2_mean': 2200.0,  # Hz (typical female second formant)
+            'formant3_mean': 3000.0,  # Hz (typical female third formant)
+        }

     def extract_features(self, audio, sample_rate):
         """Extract audio features using wav2vec2"""
@@ -38,8 +77,66 @@ class VoiceConverter:
         hidden_states = outputs.hidden_states[-1].squeeze().cpu().numpy()
         return hidden_states

+    def extract_pitch_envelope(self, audio, sample_rate):
+        """Extract pitch and envelope information"""
+        # Extract pitch using librosa
+        pitches, magnitudes = librosa.piptrack(y=audio, sr=sample_rate)
+        pitch_envelope = []
+
+        # Find the most dominant pitch at each frame
+        for i in range(pitches.shape[1]):
+            index = magnitudes[:, i].argmax()
+            pitch = pitches[index, i]
+            pitch_envelope.append(pitch if pitch > 0 else 0)
+
+        return np.array(pitch_envelope)
+
+    def extract_formants(self, audio, sample_rate, n_formants=3):
+        """Extract formant frequencies using Linear Prediction Coefficients"""
+        # Pre-emphasis to amplify higher frequencies
+        audio_pre = librosa.effects.preemphasis(audio)
+
+        # Get LPC coefficients
+        order = 2 + sample_rate // 1000  # Rule of thumb for LPC order
+        lpc = librosa.lpc(audio_pre, order=order)
+
+        # Get roots of the LPC polynomial
+        roots = np.polynomial.polynomial.polyroots(lpc)
+
+        # Keep only roots with positive imaginary part
+        roots = roots[np.imag(roots) > 0]
+
+        # Convert to frequencies
+        angles = np.arctan2(np.imag(roots), np.real(roots))
+        formants = angles * (sample_rate / (2 * np.pi))
+
+        # Sort and return the first n_formants
+        formants = sorted(formants)[:n_formants]
+        return np.array(formants)
+
+    def apply_voice_effects(self, audio, sample_rate):
+        """Apply audio effects to enhance the feminine quality of the voice"""
+        # Create a pedalboard with effects
+        board = Pedalboard([
+            # Subtle compression to even out dynamics
+            Compressor(threshold_db=-20, ratio=2.5, attack_ms=5, release_ms=100),
+
+            # Phaser for a slightly breathier quality
+            Phaser(rate_hz=0.7, depth=0.3, feedback=0.2, mix=0.15),
+
+            # Filter to enhance higher frequencies
+            LadderFilter(mode=LadderFilter.Mode.HPF12, cutoff_hz=300, resonance=0.1),
+
+            # Add a subtle reverb for smoothness
+            Reverb(room_size=0.12, damping=0.6, wet_level=0.08, dry_level=0.92)
+        ])
+
+        # Apply effects
+        effected_audio = board(audio, sample_rate)
+        return effected_audio
+
+    def convert_to_female(self, audio_path, output_path, intensity='medium'):
+        """Convert voice from male to female with adjustable intensity"""
         try:
             # Load audio file
             audio, sample_rate = librosa.load(audio_path, sr=None)
@@ -48,46 +145,88 @@ class VoiceConverter:
             if len(audio.shape) > 1:
                 audio = librosa.to_mono(audio)

+            # Get pitch and formant shift factors based on intensity
+            pitch_factor = self.female_pitch_factors[intensity]
+            formant_factor = self.female_formant_factors[intensity]
+
+            # Extract pitch contour and formants
+            pitch_envelope = self.extract_pitch_envelope(audio, sample_rate)
+            formants = self.extract_formants(audio, sample_rate)
+
+            print(f"Original average pitch: {np.mean(pitch_envelope[pitch_envelope > 0])} Hz")
+            print(f"Original formants: {formants} Hz")

+            # Apply time-frequency domain transformation (WORLD or PSOLA would be better,
+            # but using a simpler approach for demonstration)
+
+            # 1. Apply pitch shifting
+            audio_pitched = librosa.effects.pitch_shift(
                 audio,
                 sr=sample_rate,
+                n_steps=pitch_factor
             )

+            # 2. Apply formant shifting using a more sophisticated approach
+            # First, split audio into harmonic and percussive components
+            harmonic, percussive = librosa.effects.hpss(audio_pitched)
+
+            # Apply formant transformation to harmonic component
+            n_fft = 2048
+            hop_length = 512
+
+            # Get spectrogram
+            D = librosa.stft(harmonic, n_fft=n_fft, hop_length=hop_length)
+
+            # Compress/stretch frequency axis to shift formants
+            freq_bins = D.shape[0]
+
+            # Create a warping matrix for formant shifting
+            warp_matrix = np.zeros((freq_bins, freq_bins))
+            for i in range(freq_bins):
+                target_bin = int(i / formant_factor)
+                if target_bin < freq_bins:
+                    warp_matrix[i, target_bin] = 1

+            # Apply the frequency warping
+            D_warped = np.dot(warp_matrix, np.abs(D)) * np.exp(1j * np.angle(D))
+
+            # Convert back to time domain
+            harmonic_formant_shifted = librosa.istft(D_warped, hop_length=hop_length)
+
+            # Ensure both components have the same length
+            min_len = min(len(harmonic_formant_shifted), len(percussive))
+            harmonic_formant_shifted = harmonic_formant_shifted[:min_len]
+            percussive = percussive[:min_len]
+
+            # Recombine harmonic and percussive parts
+            audio_transformed = harmonic_formant_shifted + 0.8 * percussive
+
+            # Apply audio effects to enhance feminine qualities
+            audio_enhanced = self.apply_voice_effects(audio_transformed, sample_rate)

             # Save the result
+            sf.write(output_path, audio_enhanced, sample_rate)
+
+            # Extract post-conversion stats for logging
+            pitch_envelope_after = self.extract_pitch_envelope(audio_enhanced, sample_rate)
+            formants_after = self.extract_formants(audio_enhanced, sample_rate)
+
+            print(f"Converted average pitch: {np.mean(pitch_envelope_after[pitch_envelope_after > 0])} Hz")
+            print(f"Converted formants: {formants_after} Hz")
+
             return output_path

         except Exception as e:
             print(f"Error during conversion: {e}")
+            import traceback
+            traceback.print_exc()
             return None

 # Initialize voice converter
+voice_converter = AdvancedVoiceConverter()

 # Create Gradio interface
+def convert_voice(audio_file, intensity):
     """Function to handle the Gradio interface"""
     # Create a temporary file path for the output
     input_filename = os.path.basename(audio_file)
@@ -95,7 +234,7 @@ def convert_voice(audio_file):
     output_path = os.path.join(os.path.dirname(audio_file), output_filename)

     # Perform voice conversion
+    result = voice_converter.convert_to_female(audio_file, output_path, intensity)

     if result:
         return result
@@ -105,12 +244,32 @@ def convert_voice(audio_file):
 # Define the Gradio interface
 demo = gr.Interface(
     fn=convert_voice,
+    inputs=[
+        gr.Audio(type="filepath", label="Upload Voice Audio"),
+        gr.Radio(
+            ["low", "medium", "high"],
+            label="Feminization Intensity",
+            value="medium",
+            info="Choose how much to feminize the voice"
+        )
+    ],
     outputs=gr.Audio(label="Converted Female Voice"),
+    title="Advanced Voice Gender Conversion",
+    description="Upload an audio file to convert it to a female voice using advanced AI techniques.",
+    examples=[["sample1.wav", "medium"], ["sample2.wav", "high"]],
+    theme=gr.themes.Soft(),
+    article="""
+    ## How This Works
+
+    This application uses several advanced techniques to convert voices to sound more feminine:
+
+    1. **Pitch Shifting**: Raises the fundamental frequency to typical female ranges
+    2. **Formant Modification**: Adjusts the vocal tract resonances that differentiate male and female voices
+    3. **Voice Effects**: Applies subtle audio effects to enhance feminine voice qualities
+    4. **Spectral Transformation**: Uses frequency warping to achieve more natural-sounding results
+
+    The 'Feminization Intensity' lets you control how dramatic the transformation should be.
+    """
 )

 # Launch the app
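The inline comment in convert_to_female notes that a WORLD or PSOLA vocoder would handle the pitch and formant transformation better than direct STFT warping. As a rough illustration only, not part of this commit, here is a minimal sketch of that WORLD-based route using the pyworld package; the function name, the shift factors, and the linear envelope warp are illustrative assumptions:

import numpy as np
import librosa
import soundfile as sf
import pyworld as pw  # hypothetical alternative dependency, not used by this Space

def convert_with_world(input_path, output_path, pitch_factor=1.8, formant_factor=1.2):
    # pyworld expects float64 mono audio
    audio, sr = librosa.load(input_path, sr=None, mono=True)
    audio = audio.astype(np.float64)

    # WORLD analysis: F0 contour, spectral envelope, aperiodicity
    f0, t = pw.harvest(audio, sr)
    sp = pw.cheaptrick(audio, f0, t, sr)
    ap = pw.d4c(audio, f0, t, sr)

    # Raise the fundamental frequency toward a typical female range
    f0_shifted = f0 * pitch_factor

    # Stretch the spectral envelope along the frequency axis to raise formants
    n_bins = sp.shape[1]
    src_bins = np.clip(np.arange(n_bins) / formant_factor, 0, n_bins - 1)
    sp_shifted = np.zeros_like(sp)
    for frame in range(sp.shape[0]):
        sp_shifted[frame] = np.interp(src_bins, np.arange(n_bins), sp[frame])

    # Resynthesize with the modified F0 and envelope, keeping aperiodicity unchanged
    converted = pw.synthesize(f0_shifted, sp_shifted, ap, sr)
    sf.write(output_path, converted, sr)
    return output_path

A sketch like this could stand in for the pitch-shift and warp-matrix section of convert_to_female while keeping the Pedalboard post-processing stage unchanged. For completeness, the converter defined in this commit can also be driven directly, without the Gradio UI; the file names here are placeholders:

converter = AdvancedVoiceConverter()
converter.convert_to_female("input.wav", "output.wav", intensity="medium")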