import os
import torch
import numpy as np
import gradio as gr
import librosa
import soundfile as sf
import torch.nn as nn
import torch.nn.functional as F
from scipy.io import wavfile
from scipy import signal
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from pedalboard import Pedalboard, Compressor, LadderFilter, Phaser, Reverb
from pedalboard.io import AudioFile

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class VoiceEncoder(nn.Module):
    """Voice embedding network to extract speaker identity features"""
    def __init__(self, input_dim=512, hidden_dim=256, embedding_dim=128):
        super(VoiceEncoder, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(hidden_dim*2, embedding_dim)
        
    def forward(self, x):
        self.lstm.flatten_parameters()
        x, _ = self.lstm(x)
        x = self.linear(x[:, -1, :])  # Take last timestep
        x = F.normalize(x, p=2, dim=1)  # L2 normalization
        return x
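
# Note: VoiceEncoder is instantiated below but never trained or called in this script.
# If it were wired in, it would expect features shaped (batch, frames, input_dim=512);
# wav2vec2-base hidden states are 768-dimensional, so input_dim would need to change
# to match them. A hypothetical smoke test (not part of the pipeline):
#
#   feats = torch.randn(1, 200, 512)      # fake frame features
#   emb = VoiceEncoder()(feats)           # -> (1, 128), L2-normalized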

class AdvancedVoiceConverter:
    def __init__(self):
        # Load wav2vec model for audio feature extraction
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)
        
        # Initialize voice encoder (placeholder - not fully implemented)
        self.voice_encoder = VoiceEncoder().to(device)
        
        # Voice modification parameters (pitch shifts are in semitones, as used by
        # librosa.effects.pitch_shift below)
        self.female_pitch_factors = {
            'low': 1.5,      # slight pitch increase (+1.5 semitones)
            'medium': 2.5,   # moderate pitch increase (+2.5 semitones)
            'high': 3.5      # significant pitch increase (+3.5 semitones)
        }
        
        self.female_formant_factors = {
            'low': 1.15,
            'medium': 1.25,
            'high': 1.35
        }
        
        # Target female voice statistics (reference values; these would normally be
        # learned from data)
        self.female_stats = {
            'pitch_mean': 220.0,  # Hz (typical female fundamental frequency)
            'pitch_std': 30.0,
            'formant1_mean': 850.0,  # Hz (typical female first formant)
            'formant2_mean': 2200.0,  # Hz (typical female second formant)
            'formant3_mean': 3000.0,  # Hz (typical female third formant)
        }
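        # Note: these statistics are reference values only; nothing below reads them
        # yet. A hedged sketch of how they could drive an adaptive shift (hypothetical
        # helper, not part of the current pipeline):
        #
        #   def adaptive_semitone_shift(self, source_pitch_hz):
        #       """Semitones needed to move a source mean pitch onto the target mean."""
        #       return 12.0 * np.log2(self.female_stats['pitch_mean'] / source_pitch_hz)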
        
    def extract_features(self, audio, sample_rate):
        """Extract audio features using wav2vec2"""
        # Resample if needed
        if sample_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000
            
        # Extract features
        inputs = self.processor(audio, sampling_rate=sample_rate, return_tensors="pt").to(device)
        with torch.no_grad():
            # Request hidden states explicitly; Wav2Vec2ForCTC does not return them
            # unless output_hidden_states=True
            outputs = self.model(**inputs, output_hidden_states=True)

        # Get the last hidden state (frame-level features)
        hidden_states = outputs.hidden_states[-1].squeeze().cpu().numpy()
        return hidden_states
    
    def extract_pitch_envelope(self, audio, sample_rate):
        """Extract pitch and envelope information"""
        # Extract pitch using librosa
        pitches, magnitudes = librosa.piptrack(y=audio, sr=sample_rate)
        pitch_envelope = []
        
        # Find the most dominant pitch at each frame
        for i in range(pitches.shape[1]):
            index = magnitudes[:, i].argmax()
            pitch = pitches[index, i]
            pitch_envelope.append(pitch if pitch > 0 else 0)
            
        return np.array(pitch_envelope)
    
    def extract_formants(self, audio, sample_rate, n_formants=3):
        """Extract formant frequencies using Linear Prediction Coefficients"""
        # Pre-emphasis to amplify higher frequencies
        audio_pre = librosa.effects.preemphasis(audio)
        
        # Get LPC coefficients
        order = 2 + sample_rate // 1000  # Rule of thumb for LPC order
        lpc = librosa.lpc(audio_pre, order=order)
        
        # Get roots of the LPC polynomial
        roots = np.roots(lpc)
        
        # Keep only roots with positive imaginary part
        roots = roots[np.imag(roots) > 0]
        
        # Convert to frequencies
        angles = np.arctan2(np.imag(roots), np.real(roots))
        formants = angles * (sample_rate / (2 * np.pi))
        
        # Sort ascending, drop spurious near-DC values, and return the first n_formants
        formants = np.sort(formants)
        formants = formants[formants > 90][:n_formants]
        return formants
    
    def apply_voice_effects(self, audio, sample_rate):
        """Apply audio effects to enhance the feminine quality of the voice"""
        # Create a pedalboard with effects
        board = Pedalboard([
            # Subtle compression to even out dynamics
            Compressor(threshold_db=-20, ratio=2.5, attack_ms=5, release_ms=100),
            
            # Phaser for a slightly breathier quality
            Phaser(rate_hz=0.7, depth=0.3, feedback=0.2, mix=0.15),
            
            # Filter to enhance higher frequencies
            LadderFilter(mode=LadderFilter.Mode.HPF12, cutoff_hz=300, resonance=0.1),
            
            # Add a subtle reverb for smoothness
            Reverb(room_size=0.12, damping=0.6, wet_level=0.08, dry_level=0.92)
        ])
        
        # Apply effects
        effected_audio = board(audio, sample_rate)
        return effected_audio
        
    def convert_to_female(self, audio_path, output_path, intensity='medium'):
        """Convert voice from male to female with adjustable intensity"""
        try:
            # Load audio file
            audio, sample_rate = librosa.load(audio_path, sr=None)
            
            # Convert to mono if stereo
            if len(audio.shape) > 1:
                audio = librosa.to_mono(audio)
            
            # Get pitch and formant shift factors based on intensity
            pitch_factor = self.female_pitch_factors[intensity]
            formant_factor = self.female_formant_factors[intensity]
            
            # Extract pitch contour and formants
            pitch_envelope = self.extract_pitch_envelope(audio, sample_rate)
            formants = self.extract_formants(audio, sample_rate)
            
            print(f"Original average pitch: {np.mean(pitch_envelope[pitch_envelope > 0])} Hz")
            print(f"Original formants: {formants} Hz")
            
            # Apply time-frequency domain transformation (WORLD or PSOLA would be better,
            # but using a simpler approach for demonstration)
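            # A hedged sketch of the WORLD-based alternative mentioned above, assuming
            # the optional `pyworld` package were installed (it is not imported by this
            # script). WORLD separates f0, spectral envelope, and aperiodicity, so pitch
            # can be rescaled cleanly before resynthesis; shown here for pitch only:
            #
            #   import pyworld as pw
            #   x = audio.astype(np.float64)
            #   f0, sp, ap = pw.wav2world(x, sample_rate)
            #   audio_world = pw.synthesize(f0 * 2 ** (pitch_factor / 12.0), sp, ap, sample_rate)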
            
            # 1. Apply pitch shifting
            audio_pitched = librosa.effects.pitch_shift(
                audio, 
                sr=sample_rate, 
                n_steps=pitch_factor
            )
            
            # 2. Apply formant shifting using a more sophisticated approach
            # First, split audio into harmonic and percussive components
            harmonic, percussive = librosa.effects.hpss(audio_pitched)
            
            # Apply formant transformation to harmonic component
            n_fft = 2048
            hop_length = 512
            
            # Get spectrogram
            D = librosa.stft(harmonic, n_fft=n_fft, hop_length=hop_length)
            
            # Compress/stretch frequency axis to shift formants
            freq_bins = D.shape[0]
            
            # Create a warping matrix for formant shifting: output bin i draws its
            # magnitude from input bin i / formant_factor, stretching the spectral
            # envelope upward by formant_factor
            warp_matrix = np.zeros((freq_bins, freq_bins))
            for i in range(freq_bins):
                source_bin = int(i / formant_factor)
                if source_bin < freq_bins:
                    warp_matrix[i, source_bin] = 1
            
            # Apply the frequency warping
            D_warped = np.dot(warp_matrix, np.abs(D)) * np.exp(1j * np.angle(D))
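            # A lighter-weight equivalent, sketched here as a comment: resample the
            # magnitude spectrum along the frequency axis with np.interp instead of
            # building a dense freq_bins x freq_bins matrix (an alternative, not the
            # method used above):
            #
            #   bins = np.arange(freq_bins)
            #   mag_warped = np.stack(
            #       [np.interp(bins / formant_factor, bins, np.abs(D[:, t]))
            #        for t in range(D.shape[1])], axis=1)
            #   D_warped = mag_warped * np.exp(1j * np.angle(D))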
            
            # Convert back to time domain
            harmonic_formant_shifted = librosa.istft(D_warped, hop_length=hop_length)
            
            # Ensure both components have the same length
            min_len = min(len(harmonic_formant_shifted), len(percussive))
            harmonic_formant_shifted = harmonic_formant_shifted[:min_len]
            percussive = percussive[:min_len]
            
            # Recombine harmonic and percussive parts
            audio_transformed = harmonic_formant_shifted + 0.8 * percussive
            
            # Apply audio effects to enhance feminine qualities
            audio_enhanced = self.apply_voice_effects(audio_transformed, sample_rate)
            
            # Save the result
            sf.write(output_path, audio_enhanced, sample_rate)
            
            # Extract post-conversion stats for logging
            pitch_envelope_after = self.extract_pitch_envelope(audio_enhanced, sample_rate)
            formants_after = self.extract_formants(audio_enhanced, sample_rate)
            
            print(f"Converted average pitch: {np.mean(pitch_envelope_after[pitch_envelope_after > 0])} Hz")
            print(f"Converted formants: {formants_after} Hz")
            
            return output_path
            
        except Exception as e:
            print(f"Error during conversion: {e}")
            import traceback
            traceback.print_exc()
            return None

# Initialize voice converter
voice_converter = AdvancedVoiceConverter()
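
# The converter can also be used directly, without the web UI (file paths below are
# placeholders, not files shipped with this script):
#
#   voice_converter.convert_to_female("input.wav", "output_female.wav", intensity="high")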

# Create Gradio interface
def convert_voice(audio_file, intensity):
    """Function to handle the Gradio interface"""
    # Create a temporary file path for the output
    input_filename = os.path.basename(audio_file)
    output_filename = f"female_{input_filename}"
    output_path = os.path.join(os.path.dirname(audio_file), output_filename)
    
    # Perform voice conversion
    result = voice_converter.convert_to_female(audio_file, output_path, intensity)
    
    if result:
        return result
    else:
        return None

# Define the Gradio interface
demo = gr.Interface(
    fn=convert_voice,
    inputs=[
        gr.Audio(type="filepath", label="Upload Voice Audio"),
        gr.Radio(
            ["low", "medium", "high"], 
            label="Feminization Intensity", 
            value="medium",
            info="Choose how much to feminize the voice"
        )
    ],
    outputs=gr.Audio(label="Converted Female Voice"),
    title="Advanced Voice Gender Conversion",
    description="Upload an audio file to convert it to a female voice using advanced AI techniques.",
    examples=[["sample1.wav", "medium"], ["sample2.wav", "high"]],
    theme=gr.themes.Soft(),
    article="""
    ## How This Works
    
    This application uses several advanced techniques to convert voices to sound more feminine:
    
    1. **Pitch Shifting**: Raises the fundamental frequency to typical female ranges
    2. **Formant Modification**: Adjusts the vocal tract resonances that differentiate male and female voices
    3. **Voice Effects**: Applies subtle audio effects to enhance feminine voice qualities
    4. **Spectral Transformation**: Uses frequency warping to achieve more natural-sounding results
    
    The 'Feminization Intensity' lets you control how dramatic the transformation should be.
    """
)
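
# Note: the `examples` entries above assume sample1.wav and sample2.wav exist next to
# this script; if they are missing, Gradio may warn or fail to render the examples.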

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)