import os
import torch
import numpy as np
import gradio as gr
import librosa
import soundfile as sf
import torch.nn as nn
import torch.nn.functional as F
from scipy.io import wavfile
from scipy import signal
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from pedalboard import Pedalboard, Compressor, LadderFilter, Phaser, Reverb
from pedalboard.io import AudioFile

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class VoiceEncoder(nn.Module):
    """Voice embedding network to extract speaker identity features"""
    def __init__(self, input_dim=512, hidden_dim=256, embedding_dim=128):
        super(VoiceEncoder, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(hidden_dim*2, embedding_dim)
        
    def forward(self, x):
        self.lstm.flatten_parameters()
        x, _ = self.lstm(x)
        x = self.linear(x[:, -1, :])  # Take last timestep
        x = F.normalize(x, p=2, dim=1)  # L2 normalization
        return x
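
# Note: VoiceEncoder is instantiated below but never trained or called in this script.
# If it were wired in, it would expect features shaped (batch, frames, input_dim=512);
# wav2vec2-base hidden states are 768-dimensional, so input_dim would need to change
# to match them. A hypothetical smoke test (not part of the pipeline):
#
#   feats = torch.randn(1, 200, 512)      # fake frame features
#   emb = VoiceEncoder()(feats)           # -> (1, 128), L2-normalized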

class AdvancedVoiceConverter:
    def __init__(self):
        # Load wav2vec model for audio feature extraction
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)
        
        # Initialize voice encoder (placeholder - not fully implemented)
        self.voice_encoder = VoiceEncoder().to(device)
        
        # Voice modification parameters (pitch shifts are in semitones, as used by
        # librosa.effects.pitch_shift below)
        self.female_pitch_factors = {
            'low': 1.5,      # slight pitch increase (+1.5 semitones)
            'medium': 2.5,   # moderate pitch increase (+2.5 semitones)
            'high': 3.5      # significant pitch increase (+3.5 semitones)
        }
        
        self.female_formant_factors = {
            'low': 1.15,
            'medium': 1.25,
            'high': 1.35
        }
        
        # Target female voice statistics (reference values; these would normally be
        # learned from data)
        self.female_stats = {
            'pitch_mean': 220.0,  # Hz (typical female fundamental frequency)
            'pitch_std': 30.0,
            'formant1_mean': 850.0,  # Hz (typical female first formant)
            'formant2_mean': 2200.0,  # Hz (typical female second formant)
            'formant3_mean': 3000.0,  # Hz (typical female third formant)
        }
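        # Note: these statistics are reference values only; nothing below reads them
        # yet. A hedged sketch of how they could drive an adaptive shift (hypothetical
        # helper, not part of the current pipeline):
        #
        #   def adaptive_semitone_shift(self, source_pitch_hz):
        #       """Semitones needed to move a source mean pitch onto the target mean."""
        #       return 12.0 * np.log2(self.female_stats['pitch_mean'] / source_pitch_hz)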
        
    def extract_features(self, audio, sample_rate):
        """Extract audio features using wav2vec2"""
        # Resample if needed
        if sample_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000
            
        # Extract features
        inputs = self.processor(audio, sampling_rate=sample_rate, return_tensors="pt").to(device)
        with torch.no_grad():
            # Request hidden states explicitly; Wav2Vec2ForCTC does not return them
            # unless output_hidden_states=True
            outputs = self.model(**inputs, output_hidden_states=True)

        # Get the last hidden state (frame-level features)
        hidden_states = outputs.hidden_states[-1].squeeze().cpu().numpy()
        return hidden_states
    
    def extract_pitch_envelope(self, audio, sample_rate):
        """Extract pitch and envelope information"""
        # Extract pitch using librosa
        pitches, magnitudes = librosa.piptrack(y=audio, sr=sample_rate)
        pitch_envelope = []
        
        # Find the most dominant pitch at each frame
        for i in range(pitches.shape[1]):
            index = magnitudes[:, i].argmax()
            pitch = pitches[index, i]
            pitch_envelope.append(pitch if pitch > 0 else 0)
            
        return np.array(pitch_envelope)
    
    def extract_formants(self, audio, sample_rate, n_formants=3):
        """Extract formant frequencies using Linear Prediction Coefficients"""
        # Pre-emphasis to amplify higher frequencies
        audio_pre = librosa.effects.preemphasis(audio)
        
        # Get LPC coefficients
        order = 2 + sample_rate // 1000  # Rule of thumb for LPC order
        lpc = librosa.lpc(audio_pre, order=order)
        
        # Get roots of the LPC polynomial
        roots = np.roots(lpc)
        
        # Keep only roots with positive imaginary part
        roots = roots[np.imag(roots) > 0]
        
        # Convert to frequencies
        angles = np.arctan2(np.imag(roots), np.real(roots))
        formants = angles * (sample_rate / (2 * np.pi))
        
        # Sort ascending, drop spurious near-DC values, and return the first n_formants
        formants = np.sort(formants)
        formants = formants[formants > 90][:n_formants]
        return formants
    
    def apply_voice_effects(self, audio, sample_rate):
        """Apply audio effects to enhance the feminine quality of the voice"""
        # Create a pedalboard with effects
        board = Pedalboard([
            # Subtle compression to even out dynamics
            Compressor(threshold_db=-20, ratio=2.5, attack_ms=5, release_ms=100),
            
            # Phaser for a slightly breathier quality
            Phaser(rate_hz=0.7, depth=0.3, feedback=0.2, mix=0.15),
            
            # Filter to enhance higher frequencies
            LadderFilter(mode=LadderFilter.Mode.HPF12, cutoff_hz=300, resonance=0.1),
            
            # Add a subtle reverb for smoothness
            Reverb(room_size=0.12, damping=0.6, wet_level=0.08, dry_level=0.92)
        ])
        
        # Apply effects
        effected_audio = board(audio, sample_rate)
        return effected_audio
        
    def convert_to_female(self, audio_path, output_path, intensity='medium'):
        """Convert voice from male to female with adjustable intensity"""
        try:
            # Load audio file
            audio, sample_rate = librosa.load(audio_path, sr=None)
            
            # Convert to mono if stereo
            if len(audio.shape) > 1:
                audio = librosa.to_mono(audio)
            
            # Get pitch and formant shift factors based on intensity
            pitch_factor = self.female_pitch_factors[intensity]
            formant_factor = self.female_formant_factors[intensity]
            
            # Extract pitch contour and formants
            pitch_envelope = self.extract_pitch_envelope(audio, sample_rate)
            formants = self.extract_formants(audio, sample_rate)
            
            print(f"Original average pitch: {np.mean(pitch_envelope[pitch_envelope > 0])} Hz")
            print(f"Original formants: {formants} Hz")
            
            # Apply time-frequency domain transformation (WORLD or PSOLA would be better,
            # but using a simpler approach for demonstration)
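            # A hedged sketch of the WORLD-based alternative mentioned above, assuming
            # the optional `pyworld` package were installed (it is not imported by this
            # script). WORLD separates f0, spectral envelope, and aperiodicity, so pitch
            # can be rescaled cleanly before resynthesis; shown here for pitch only:
            #
            #   import pyworld as pw
            #   x = audio.astype(np.float64)
            #   f0, sp, ap = pw.wav2world(x, sample_rate)
            #   audio_world = pw.synthesize(f0 * 2 ** (pitch_factor / 12.0), sp, ap, sample_rate)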
            
            # 1. Apply pitch shifting
            audio_pitched = librosa.effects.pitch_shift(
                audio, 
                sr=sample_rate, 
                n_steps=pitch_factor
            )
            
            # 2. Apply formant shifting using a more sophisticated approach
            # First, split audio into harmonic and percussive components
            harmonic, percussive = librosa.effects.hpss(audio_pitched)
            
            # Apply formant transformation to harmonic component
            n_fft = 2048
            hop_length = 512
            
            # Get spectrogram
            D = librosa.stft(harmonic, n_fft=n_fft, hop_length=hop_length)
            
            # Compress/stretch frequency axis to shift formants
            freq_bins = D.shape[0]
            
            # Create a warping matrix for formant shifting: output bin i draws its
            # magnitude from input bin i / formant_factor, stretching the spectral
            # envelope upward by formant_factor
            warp_matrix = np.zeros((freq_bins, freq_bins))
            for i in range(freq_bins):
                source_bin = int(i / formant_factor)
                if source_bin < freq_bins:
                    warp_matrix[i, source_bin] = 1
            
            # Apply the frequency warping
            D_warped = np.dot(warp_matrix, np.abs(D)) * np.exp(1j * np.angle(D))
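            # A lighter-weight equivalent, sketched here as a comment: resample the
            # magnitude spectrum along the frequency axis with np.interp instead of
            # building a dense freq_bins x freq_bins matrix (an alternative, not the
            # method used above):
            #
            #   bins = np.arange(freq_bins)
            #   mag_warped = np.stack(
            #       [np.interp(bins / formant_factor, bins, np.abs(D[:, t]))
            #        for t in range(D.shape[1])], axis=1)
            #   D_warped = mag_warped * np.exp(1j * np.angle(D))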
            
            # Convert back to time domain
            harmonic_formant_shifted = librosa.istft(D_warped, hop_length=hop_length)
            
            # Ensure both components have the same length
            min_len = min(len(harmonic_formant_shifted), len(percussive))
            harmonic_formant_shifted = harmonic_formant_shifted[:min_len]
            percussive = percussive[:min_len]
            
            # Recombine harmonic and percussive parts
            audio_transformed = harmonic_formant_shifted + 0.8 * percussive
            
            # Apply audio effects to enhance feminine qualities
            audio_enhanced = self.apply_voice_effects(audio_transformed, sample_rate)
            
            # Save the result
            sf.write(output_path, audio_enhanced, sample_rate)
            
            # Extract post-conversion stats for logging
            pitch_envelope_after = self.extract_pitch_envelope(audio_enhanced, sample_rate)
            formants_after = self.extract_formants(audio_enhanced, sample_rate)
            
            print(f"Converted average pitch: {np.mean(pitch_envelope_after[pitch_envelope_after > 0])} Hz")
            print(f"Converted formants: {formants_after} Hz")
            
            return output_path
            
        except Exception as e:
            print(f"Error during conversion: {e}")
            import traceback
            traceback.print_exc()
            return None

# Initialize voice converter
voice_converter = AdvancedVoiceConverter()
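
# The converter can also be used directly, without the web UI (file paths below are
# placeholders, not files shipped with this script):
#
#   voice_converter.convert_to_female("input.wav", "output_female.wav", intensity="high")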

# Create Gradio interface
def convert_voice(audio_file, intensity):
    """Function to handle the Gradio interface"""
    # Create a temporary file path for the output
    input_filename = os.path.basename(audio_file)
    output_filename = f"female_{input_filename}"
    output_path = os.path.join(os.path.dirname(audio_file), output_filename)
    
    # Perform voice conversion
    result = voice_converter.convert_to_female(audio_file, output_path, intensity)
    
    if result:
        return result
    else:
        return None

# Define the Gradio interface
demo = gr.Interface(
    fn=convert_voice,
    inputs=[
        gr.Audio(type="filepath", label="Upload Voice Audio"),
        gr.Radio(
            ["low", "medium", "high"], 
            label="Feminization Intensity", 
            value="medium",
            info="Choose how much to feminize the voice"
        )
    ],
    outputs=gr.Audio(label="Converted Female Voice"),
    title="Advanced Voice Gender Conversion",
    description="Upload an audio file to convert it to a female voice using advanced AI techniques.",
    examples=[["sample1.wav", "medium"], ["sample2.wav", "high"]],
    theme=gr.themes.Soft(),
    article="""
    ## How This Works
    
    This application uses several advanced techniques to convert voices to sound more feminine:
    
    1. **Pitch Shifting**: Raises the fundamental frequency to typical female ranges
    2. **Formant Modification**: Adjusts the vocal tract resonances that differentiate male and female voices
    3. **Voice Effects**: Applies subtle audio effects to enhance feminine voice qualities
    4. **Spectral Transformation**: Uses frequency warping to achieve more natural-sounding results
    
    The 'Feminization Intensity' lets you control how dramatic the transformation should be.
    """
)
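
# Note: the `examples` entries above assume sample1.wav and sample2.wav exist next to
# this script; if they are missing, Gradio may warn or fail to render the examples.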

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)