Update app.py
app.py
CHANGED
Removed lines (the previous VoiceConverter implementation; lines cut off in the diff view end with "…"):

-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-class VoiceConverter:
-    def …
-        """…
-        # Apply …
-            n_steps=…
-        # Apply formant shifting using a …
-        sf.write(output_path, …
-voice_converter = …
-def convert_voice(audio_file):
-    result = voice_converter.convert_to_female(audio_file, output_path)
-    inputs=…
-    title="Voice Gender Conversion…
-    description="Upload an audio file…
-    examples=[["sample1.wav"], ["sample2.wav"]],
-    theme=gr.themes.Soft()
@@ -4,23 +4,62 @@ import numpy as np
 import gradio as gr
 import librosa
 import soundfile as sf
+import torch.nn as nn
+import torch.nn.functional as F
 from scipy.io import wavfile
 from scipy import signal
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+from pedalboard import Pedalboard, Compressor, LadderFilter, Phaser, Reverb
+from pedalboard.io import AudioFile

 # Set device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")

+class VoiceEncoder(nn.Module):
+    """Voice embedding network to extract speaker identity features"""
+    def __init__(self, input_dim=512, hidden_dim=256, embedding_dim=128):
+        super(VoiceEncoder, self).__init__()
+        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
+        self.linear = nn.Linear(hidden_dim*2, embedding_dim)
+
+    def forward(self, x):
+        self.lstm.flatten_parameters()
+        x, _ = self.lstm(x)
+        x = self.linear(x[:, -1, :])  # Take last timestep
+        x = F.normalize(x, p=2, dim=1)  # L2 normalization
+        return x
+
+class AdvancedVoiceConverter:
     def __init__(self):
         # Load wav2vec model for audio feature extraction
         self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
         self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)

+        # Initialize voice encoder (placeholder - not fully implemented)
+        self.voice_encoder = VoiceEncoder().to(device)
+
+        # Voice modification parameters
+        self.female_pitch_factors = {
+            'low': 1.5,     # Slight pitch increase
+            'medium': 2.5,  # Moderate pitch increase
+            'high': 3.5     # Significant pitch increase
+        }
+
+        self.female_formant_factors = {
+            'low': 1.15,
+            'medium': 1.25,
+            'high': 1.35
+        }
+
+        # Load target female voice statistics (these would normally be learned from data)
+        self.female_stats = {
+            'pitch_mean': 220.0,      # Hz (typical female fundamental frequency)
+            'pitch_std': 30.0,
+            'formant1_mean': 850.0,   # Hz (typical female first formant)
+            'formant2_mean': 2200.0,  # Hz (typical female second formant)
+            'formant3_mean': 3000.0,  # Hz (typical female third formant)
+        }

     def extract_features(self, audio, sample_rate):
         """Extract audio features using wav2vec2"""
@@ -38,8 +77,66 @@ class VoiceConverter:
         hidden_states = outputs.hidden_states[-1].squeeze().cpu().numpy()
         return hidden_states

+    def extract_pitch_envelope(self, audio, sample_rate):
+        """Extract pitch and envelope information"""
+        # Extract pitch using librosa
+        pitches, magnitudes = librosa.piptrack(y=audio, sr=sample_rate)
+        pitch_envelope = []
+
+        # Find the most dominant pitch at each frame
+        for i in range(pitches.shape[1]):
+            index = magnitudes[:, i].argmax()
+            pitch = pitches[index, i]
+            pitch_envelope.append(pitch if pitch > 0 else 0)
+
+        return np.array(pitch_envelope)
+
+    def extract_formants(self, audio, sample_rate, n_formants=3):
+        """Extract formant frequencies using Linear Prediction Coefficients"""
+        # Pre-emphasis to amplify higher frequencies
+        audio_pre = librosa.effects.preemphasis(audio)
+
+        # Get LPC coefficients
+        order = 2 + sample_rate // 1000  # Rule of thumb for LPC order
+        lpc = librosa.lpc(audio_pre, order=order)
+
+        # Get roots of the LPC polynomial
+        roots = np.polynomial.polynomial.polyroots(lpc)
+
+        # Keep only roots with positive imaginary part
+        roots = roots[np.imag(roots) > 0]
+
+        # Convert to frequencies
+        angles = np.arctan2(np.imag(roots), np.real(roots))
+        formants = angles * (sample_rate / (2 * np.pi))
+
+        # Sort and return the first n_formants
+        formants = sorted(formants)[:n_formants]
+        return np.array(formants)
+
+    def apply_voice_effects(self, audio, sample_rate):
+        """Apply audio effects to enhance the feminine quality of the voice"""
+        # Create a pedalboard with effects
+        board = Pedalboard([
+            # Subtle compression to even out dynamics
+            Compressor(threshold_db=-20, ratio=2.5, attack_ms=5, release_ms=100),
+
+            # Phaser for a slightly breathier quality
+            Phaser(rate_hz=0.7, depth=0.3, feedback=0.2, mix=0.15),
+
+            # Filter to enhance higher frequencies
+            LadderFilter(mode=LadderFilter.Mode.HPF12, cutoff_hz=300, resonance=0.1),
+
+            # Add a subtle reverb for smoothness
+            Reverb(room_size=0.12, damping=0.6, wet_level=0.08, dry_level=0.92)
+        ])
+
+        # Apply effects
+        effected_audio = board(audio, sample_rate)
+        return effected_audio
+
+    def convert_to_female(self, audio_path, output_path, intensity='medium'):
+        """Convert voice from male to female with adjustable intensity"""
         try:
             # Load audio file
             audio, sample_rate = librosa.load(audio_path, sr=None)
@@ -48,46 +145,88 @@ class VoiceConverter:
             if len(audio.shape) > 1:
                 audio = librosa.to_mono(audio)

+            # Get pitch and formant shift factors based on intensity
+            pitch_factor = self.female_pitch_factors[intensity]
+            formant_factor = self.female_formant_factors[intensity]
+
+            # Extract pitch contour and formants
+            pitch_envelope = self.extract_pitch_envelope(audio, sample_rate)
+            formants = self.extract_formants(audio, sample_rate)
+
+            print(f"Original average pitch: {np.mean(pitch_envelope[pitch_envelope > 0])} Hz")
+            print(f"Original formants: {formants} Hz")

+            # Apply time-frequency domain transformation (WORLD or PSOLA would be better,
+            # but using a simpler approach for demonstration)
+
+            # 1. Apply pitch shifting
+            audio_pitched = librosa.effects.pitch_shift(
                 audio,
                 sr=sample_rate,
+                n_steps=pitch_factor
             )

+            # 2. Apply formant shifting using a more sophisticated approach
+            # First, split audio into harmonic and percussive components
+            harmonic, percussive = librosa.effects.hpss(audio_pitched)
+
+            # Apply formant transformation to harmonic component
+            n_fft = 2048
+            hop_length = 512
+
+            # Get spectrogram
+            D = librosa.stft(harmonic, n_fft=n_fft, hop_length=hop_length)
+
+            # Compress/stretch frequency axis to shift formants
+            freq_bins = D.shape[0]
+
+            # Create a warping matrix for formant shifting
+            warp_matrix = np.zeros((freq_bins, freq_bins))
+            for i in range(freq_bins):
+                target_bin = int(i / formant_factor)
+                if target_bin < freq_bins:
+                    warp_matrix[i, target_bin] = 1

+            # Apply the frequency warping
+            D_warped = np.dot(warp_matrix, np.abs(D)) * np.exp(1j * np.angle(D))
+
+            # Convert back to time domain
+            harmonic_formant_shifted = librosa.istft(D_warped, hop_length=hop_length)
+
+            # Ensure both components have the same length
+            min_len = min(len(harmonic_formant_shifted), len(percussive))
+            harmonic_formant_shifted = harmonic_formant_shifted[:min_len]
+            percussive = percussive[:min_len]
+
+            # Recombine harmonic and percussive parts
+            audio_transformed = harmonic_formant_shifted + 0.8 * percussive
+
+            # Apply audio effects to enhance feminine qualities
+            audio_enhanced = self.apply_voice_effects(audio_transformed, sample_rate)

             # Save the result
+            sf.write(output_path, audio_enhanced, sample_rate)
+
+            # Extract post-conversion stats for logging
+            pitch_envelope_after = self.extract_pitch_envelope(audio_enhanced, sample_rate)
+            formants_after = self.extract_formants(audio_enhanced, sample_rate)
+
+            print(f"Converted average pitch: {np.mean(pitch_envelope_after[pitch_envelope_after > 0])} Hz")
+            print(f"Converted formants: {formants_after} Hz")
+
             return output_path

         except Exception as e:
             print(f"Error during conversion: {e}")
+            import traceback
+            traceback.print_exc()
             return None

 # Initialize voice converter
+voice_converter = AdvancedVoiceConverter()

 # Create Gradio interface
+def convert_voice(audio_file, intensity):
     """Function to handle the Gradio interface"""
     # Create a temporary file path for the output
     input_filename = os.path.basename(audio_file)
@@ -95,7 +234,7 @@ def convert_voice(audio_file):
     output_path = os.path.join(os.path.dirname(audio_file), output_filename)

     # Perform voice conversion
+    result = voice_converter.convert_to_female(audio_file, output_path, intensity)

     if result:
         return result
@@ -105,12 +244,32 @@ def convert_voice(audio_file):
 # Define the Gradio interface
 demo = gr.Interface(
     fn=convert_voice,
+    inputs=[
+        gr.Audio(type="filepath", label="Upload Voice Audio"),
+        gr.Radio(
+            ["low", "medium", "high"],
+            label="Feminization Intensity",
+            value="medium",
+            info="Choose how much to feminize the voice"
+        )
+    ],
     outputs=gr.Audio(label="Converted Female Voice"),
+    title="Advanced Voice Gender Conversion",
+    description="Upload an audio file to convert it to a female voice using advanced AI techniques.",
+    examples=[["sample1.wav", "medium"], ["sample2.wav", "high"]],
+    theme=gr.themes.Soft(),
+    article="""
+    ## How This Works
+
+    This application uses several advanced techniques to convert voices to sound more feminine:
+
+    1. **Pitch Shifting**: Raises the fundamental frequency to typical female ranges
+    2. **Formant Modification**: Adjusts the vocal tract resonances that differentiate male and female voices
+    3. **Voice Effects**: Applies subtle audio effects to enhance feminine voice qualities
+    4. **Spectral Transformation**: Uses frequency warping to achieve more natural-sounding results
+
+    The 'Feminization Intensity' lets you control how dramatic the transformation should be.
+    """
 )

 # Launch the app
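The inline comment in convert_to_female notes that a WORLD or PSOLA vocoder would handle the pitch and formant transformation better than direct STFT warping. As a rough illustration only, not part of this commit, here is a minimal sketch of that WORLD-based route using the pyworld package; the function name, the shift factors, and the linear envelope warp are illustrative assumptions:

import numpy as np
import librosa
import soundfile as sf
import pyworld as pw  # hypothetical alternative dependency, not used by this Space

def convert_with_world(input_path, output_path, pitch_factor=1.8, formant_factor=1.2):
    # pyworld expects float64 mono audio
    audio, sr = librosa.load(input_path, sr=None, mono=True)
    audio = audio.astype(np.float64)

    # WORLD analysis: F0 contour, spectral envelope, aperiodicity
    f0, t = pw.harvest(audio, sr)
    sp = pw.cheaptrick(audio, f0, t, sr)
    ap = pw.d4c(audio, f0, t, sr)

    # Raise the fundamental frequency toward a typical female range
    f0_shifted = f0 * pitch_factor

    # Stretch the spectral envelope along the frequency axis to raise formants
    n_bins = sp.shape[1]
    src_bins = np.clip(np.arange(n_bins) / formant_factor, 0, n_bins - 1)
    sp_shifted = np.zeros_like(sp)
    for frame in range(sp.shape[0]):
        sp_shifted[frame] = np.interp(src_bins, np.arange(n_bins), sp[frame])

    # Resynthesize with the modified F0 and envelope, keeping aperiodicity unchanged
    converted = pw.synthesize(f0_shifted, sp_shifted, ap, sr)
    sf.write(output_path, converted, sr)
    return output_path

A sketch like this could stand in for the pitch-shift and warp-matrix section of convert_to_female while keeping the Pedalboard post-processing stage unchanged. For completeness, the converter defined in this commit can also be driven directly, without the Gradio UI; the file names here are placeholders:

converter = AdvancedVoiceConverter()
converter.convert_to_female("input.wav", "output.wav", intensity="medium")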