import os
import torch
import numpy as np
import gradio as gr
import librosa
import soundfile as sf
import torch.nn as nn
import torch.nn.functional as F
from scipy.io import wavfile
from scipy import signal
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from pedalboard import Pedalboard, Compressor, LadderFilter, Phaser, Reverb
from pedalboard.io import AudioFile

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


class VoiceEncoder(nn.Module):
    """Voice embedding network to extract speaker identity features."""

    def __init__(self, input_dim=512, hidden_dim=256, embedding_dim=128):
        super(VoiceEncoder, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2, embedding_dim)

    def forward(self, x):
        self.lstm.flatten_parameters()
        x, _ = self.lstm(x)
        x = self.linear(x[:, -1, :])    # Take the last timestep
        x = F.normalize(x, p=2, dim=1)  # L2 normalization
        return x


class AdvancedVoiceConverter:
    def __init__(self):
        # Load wav2vec2 model for audio feature extraction
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)

        # Initialize voice encoder (placeholder - not fully implemented)
        self.voice_encoder = VoiceEncoder().to(device)

        # Voice modification parameters (pitch shift amounts, in semitones)
        self.female_pitch_factors = {
            'low': 1.5,     # Slight pitch increase
            'medium': 2.5,  # Moderate pitch increase
            'high': 3.5     # Significant pitch increase
        }
        self.female_formant_factors = {
            'low': 1.15,
            'medium': 1.25,
            'high': 1.35
        }

        # Target female voice statistics (these would normally be learned from data)
        self.female_stats = {
            'pitch_mean': 220.0,      # Hz (typical female fundamental frequency)
            'pitch_std': 30.0,
            'formant1_mean': 850.0,   # Hz (typical female first formant)
            'formant2_mean': 2200.0,  # Hz (typical female second formant)
            'formant3_mean': 3000.0,  # Hz (typical female third formant)
        }

    def extract_features(self, audio, sample_rate):
        """Extract audio features using wav2vec2."""
        # Resample if needed
        if sample_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000

        # Extract features
        inputs = self.processor(audio, sampling_rate=sample_rate, return_tensors="pt").to(device)
        with torch.no_grad():
            # hidden_states are only populated when explicitly requested
            outputs = self.model(**inputs, output_hidden_states=True)

        # Get the last hidden layer (features)
        hidden_states = outputs.hidden_states[-1].squeeze().cpu().numpy()
        return hidden_states

    def extract_pitch_envelope(self, audio, sample_rate):
        """Extract the frame-by-frame pitch contour."""
        # Extract pitch candidates using librosa
        pitches, magnitudes = librosa.piptrack(y=audio, sr=sample_rate)
        pitch_envelope = []

        # Keep the most dominant pitch in each frame
        for i in range(pitches.shape[1]):
            index = magnitudes[:, i].argmax()
            pitch = pitches[index, i]
            pitch_envelope.append(pitch if pitch > 0 else 0)

        return np.array(pitch_envelope)

    def extract_formants(self, audio, sample_rate, n_formants=3):
        """Extract formant frequencies using linear prediction coefficients (LPC)."""
        # Pre-emphasis to amplify higher frequencies
        audio_pre = librosa.effects.preemphasis(audio)

        # Get LPC coefficients
        order = 2 + sample_rate // 1000  # Rule of thumb for LPC order
        lpc = librosa.lpc(audio_pre, order=order)

        # Get roots of the LPC polynomial
        roots = np.roots(lpc)

        # Keep only roots with positive imaginary part (one of each conjugate pair)
        roots = roots[np.imag(roots) > 0]

        # Convert root angles to frequencies
        angles = np.arctan2(np.imag(roots), np.real(roots))
        formants = angles * (sample_rate / (2 * np.pi))

        # Sort and return the first n_formants
        formants = sorted(formants)[:n_formants]
        return np.array(formants)

    def apply_voice_effects(self, audio, sample_rate):
        """Apply audio effects to enhance the feminine quality of the voice."""
        # Create a pedalboard with effects
        board = Pedalboard([
            # Subtle compression to even out dynamics
            Compressor(threshold_db=-20, ratio=2.5, attack_ms=5, release_ms=100),
            # Phaser for a slightly breathier quality
            Phaser(rate_hz=0.7, depth=0.3, feedback=0.2, mix=0.15),
            # High-pass filter to de-emphasize low frequencies
            LadderFilter(mode=LadderFilter.Mode.HPF12, cutoff_hz=300, resonance=0.1),
            # Add a subtle reverb for smoothness
            Reverb(room_size=0.12, damping=0.6, wet_level=0.08, dry_level=0.92)
        ])

        # Apply effects
        effected_audio = board(audio, sample_rate)
        return effected_audio
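    # --- Illustrative sketch only (not called anywhere in this script) ---
    # The formant shift in convert_to_female below builds an explicit
    # freq_bins x freq_bins warp matrix. The same frequency warping can be
    # expressed with np.interp: each output bin i reads the magnitude at
    # source position i / formant_factor. This helper is an assumed,
    # equivalent reformulation added for reference, not part of the
    # original pipeline.
    def warp_spectrogram(self, D, formant_factor):
        """Shift formants by resampling the magnitude spectrum along frequency."""
        freq_bins = D.shape[0]
        bin_indices = np.arange(freq_bins)
        source_positions = bin_indices / formant_factor  # where each output bin reads from
        magnitudes = np.abs(D)
        warped = np.empty_like(magnitudes)
        for frame in range(D.shape[1]):
            # Linear interpolation along the frequency axis for this frame
            warped[:, frame] = np.interp(source_positions, bin_indices, magnitudes[:, frame])
        # Re-attach the original phase, as convert_to_female does
        return warped * np.exp(1j * np.angle(D))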
    def convert_to_female(self, audio_path, output_path, intensity='medium'):
        """Convert voice from male to female with adjustable intensity."""
        try:
            # Load audio file
            audio, sample_rate = librosa.load(audio_path, sr=None)

            # Convert to mono if stereo
            if len(audio.shape) > 1:
                audio = librosa.to_mono(audio)

            # Get pitch and formant shift factors based on intensity
            pitch_factor = self.female_pitch_factors[intensity]
            formant_factor = self.female_formant_factors[intensity]

            # Extract pitch contour and formants
            pitch_envelope = self.extract_pitch_envelope(audio, sample_rate)
            formants = self.extract_formants(audio, sample_rate)
            print(f"Original average pitch: {np.mean(pitch_envelope[pitch_envelope > 0])} Hz")
            print(f"Original formants: {formants} Hz")

            # Apply time-frequency domain transformation (WORLD or PSOLA would be better,
            # but using a simpler approach for demonstration)

            # 1. Apply pitch shifting
            audio_pitched = librosa.effects.pitch_shift(
                audio, sr=sample_rate, n_steps=pitch_factor
            )

            # 2. Apply formant shifting via frequency warping
            # First, split audio into harmonic and percussive components
            harmonic, percussive = librosa.effects.hpss(audio_pitched)

            # Apply formant transformation to the harmonic component
            n_fft = 2048
            hop_length = 512

            # Get spectrogram
            D = librosa.stft(harmonic, n_fft=n_fft, hop_length=hop_length)

            # Compress/stretch the frequency axis to shift formants
            freq_bins = D.shape[0]

            # Create a warping matrix for formant shifting
            warp_matrix = np.zeros((freq_bins, freq_bins))
            for i in range(freq_bins):
                target_bin = int(i / formant_factor)
                if target_bin < freq_bins:
                    warp_matrix[i, target_bin] = 1

            # Apply the frequency warping, keeping the original phase
            D_warped = np.dot(warp_matrix, np.abs(D)) * np.exp(1j * np.angle(D))

            # Convert back to the time domain
            harmonic_formant_shifted = librosa.istft(D_warped, hop_length=hop_length)

            # Ensure both components have the same length
            min_len = min(len(harmonic_formant_shifted), len(percussive))
            harmonic_formant_shifted = harmonic_formant_shifted[:min_len]
            percussive = percussive[:min_len]

            # Recombine harmonic and percussive parts
            audio_transformed = harmonic_formant_shifted + 0.8 * percussive

            # Apply audio effects to enhance feminine qualities
            audio_enhanced = self.apply_voice_effects(audio_transformed, sample_rate)

            # Save the result
            sf.write(output_path, audio_enhanced, sample_rate)

            # Extract post-conversion stats for logging
            pitch_envelope_after = self.extract_pitch_envelope(audio_enhanced, sample_rate)
            formants_after = self.extract_formants(audio_enhanced, sample_rate)
            print(f"Converted average pitch: {np.mean(pitch_envelope_after[pitch_envelope_after > 0])} Hz")
            print(f"Converted formants: {formants_after} Hz")

            return output_path
        except Exception as e:
            print(f"Error during conversion: {e}")
            import traceback
            traceback.print_exc()
            return None


# Initialize voice converter
voice_converter = AdvancedVoiceConverter()


# Gradio request handler
def convert_voice(audio_file, intensity):
    """Handle a single conversion request from the Gradio interface."""
    # Build an output path next to the uploaded file
    input_filename = os.path.basename(audio_file)
    output_filename = f"female_{input_filename}"
    output_path = os.path.join(os.path.dirname(audio_file), output_filename)

    # Perform voice conversion
    result = voice_converter.convert_to_female(audio_file, output_path, intensity)
    if result:
        return result
    else:
        return None


# Define the Gradio interface
demo = gr.Interface(
    fn=convert_voice,
    inputs=[
        gr.Audio(type="filepath", label="Upload Voice Audio"),
        gr.Radio(
            ["low", "medium", "high"],
            label="Feminization Intensity",
            value="medium",
            info="Choose how much to feminize the voice"
        )
    ],
    outputs=gr.Audio(label="Converted Female Voice"),
    title="Advanced Voice Gender Conversion",
    description="Upload an audio file to convert it to a female voice using advanced AI techniques.",
    examples=[["sample1.wav", "medium"], ["sample2.wav", "high"]],
    theme=gr.themes.Soft(),
    article="""
## How This Works

This application uses several advanced techniques to convert voices to sound more feminine:

1. **Pitch Shifting**: Raises the fundamental frequency to typical female ranges
2. **Formant Modification**: Adjusts the vocal tract resonances that differentiate male and female voices
3. **Voice Effects**: Applies subtle audio effects to enhance feminine voice qualities
4. **Spectral Transformation**: Uses frequency warping to achieve more natural-sounding results

The 'Feminization Intensity' setting controls how dramatic the transformation should be.
"""
)
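# Optional: programmatic use without the Gradio UI. This is a minimal sketch;
# the file paths in the commented example are placeholders, not files shipped
# with this script.
def convert_file(input_path, output_path, intensity="medium"):
    """Convert a single file directly, e.g. from another script or a REPL."""
    result = voice_converter.convert_to_female(input_path, output_path, intensity)
    if result is None:
        raise RuntimeError(f"Conversion failed for {input_path}")
    return result

# Example (uncomment to run without launching the web UI):
# convert_file("my_recording.wav", "my_recording_female.wav", intensity="high")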
""" ) # Launch the app if __name__ == "__main__": demo.launch(share=True)