import os
import torch
import numpy as np
import gradio as gr
import librosa
import soundfile as sf
import torch.nn as nn
import torch.nn.functional as F
from scipy.io import wavfile
from scipy import signal
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from pedalboard import Pedalboard, Compressor, LadderFilter, Phaser, Reverb
from pedalboard.io import AudioFile
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class VoiceEncoder(nn.Module):
    """Voice embedding network to extract speaker identity features"""

    def __init__(self, input_dim=512, hidden_dim=256, embedding_dim=128):
        super(VoiceEncoder, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2, embedding_dim)

    def forward(self, x):
        self.lstm.flatten_parameters()
        x, _ = self.lstm(x)
        x = self.linear(x[:, -1, :])    # Take the last timestep
        x = F.normalize(x, p=2, dim=1)  # L2 normalization
        return x

class AdvancedVoiceConverter:
    def __init__(self):
        # Load wav2vec2 model for audio feature extraction
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)
        # Initialize voice encoder (placeholder - not fully implemented)
        self.voice_encoder = VoiceEncoder().to(device)
        # Voice modification parameters
        self.female_pitch_factors = {
            'low': 1.5,     # Slight pitch increase
            'medium': 2.5,  # Moderate pitch increase
            'high': 3.5     # Significant pitch increase
        }
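        # Note: these values are semitone steps passed to librosa.effects.pitch_shift,
        # not multiplicative ratios. A shift of n semitones scales frequency by 2**(n/12),
        # so 1.5 -> ~1.09x, 2.5 -> ~1.16x, and 3.5 -> ~1.22x the original pitch.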
        self.female_formant_factors = {
            'low': 1.15,
            'medium': 1.25,
            'high': 1.35
        }
        # Load target female voice statistics (these would normally be learned from data)
        self.female_stats = {
            'pitch_mean': 220.0,      # Hz (typical female fundamental frequency)
            'pitch_std': 30.0,
            'formant1_mean': 850.0,   # Hz (typical female first formant)
            'formant2_mean': 2200.0,  # Hz (typical female second formant)
            'formant3_mean': 3000.0,  # Hz (typical female third formant)
        }
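        # NOTE: these statistics are reference values only; the conversion pipeline
        # below does not currently consult them.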

    def extract_features(self, audio, sample_rate):
        """Extract audio features using wav2vec2"""
        # Resample if needed
        if sample_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000
        # Extract features
        inputs = self.processor(audio, sampling_rate=sample_rate, return_tensors="pt").to(device)
        with torch.no_grad():
            # output_hidden_states=True is required; otherwise outputs.hidden_states is None
            outputs = self.model(**inputs, output_hidden_states=True)
        # Get hidden states (features) from the last transformer layer
        hidden_states = outputs.hidden_states[-1].squeeze().cpu().numpy()
        return hidden_states
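
    # NOTE: extract_features (and the VoiceEncoder above) are not yet wired into
    # convert_to_female; they are scaffolding for a future embedding-based approach.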

    def extract_pitch_envelope(self, audio, sample_rate):
        """Extract pitch and envelope information"""
        # Extract pitch using librosa
        pitches, magnitudes = librosa.piptrack(y=audio, sr=sample_rate)
        pitch_envelope = []
        # Find the most dominant pitch at each frame
        for i in range(pitches.shape[1]):
            index = magnitudes[:, i].argmax()
            pitch = pitches[index, i]
            pitch_envelope.append(pitch if pitch > 0 else 0)
        return np.array(pitch_envelope)
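
    # piptrack returns one pitch candidate per STFT bin and frame; unvoiced frames end
    # up as 0 Hz here and are excluded before averaging in convert_to_female below.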

    def extract_formants(self, audio, sample_rate, n_formants=3):
        """Extract formant frequencies using Linear Prediction Coefficients"""
        # Pre-emphasis to amplify higher frequencies
        audio_pre = librosa.effects.preemphasis(audio)
        # Get LPC coefficients
        order = 2 + sample_rate // 1000  # Rule of thumb for LPC order
        lpc = librosa.lpc(audio_pre, order=order)
        # Get roots of the LPC polynomial
        roots = np.polynomial.polynomial.polyroots(lpc)
        # Keep only roots with positive imaginary part
        roots = roots[np.imag(roots) > 0]
        # Convert root angles to frequencies in Hz
        angles = np.arctan2(np.imag(roots), np.real(roots))
        formants = angles * (sample_rate / (2 * np.pi))
        # Sort and return the first n_formants
        formants = sorted(formants)[:n_formants]
        return np.array(formants)

    def apply_voice_effects(self, audio, sample_rate):
        """Apply audio effects to enhance the feminine quality of the voice"""
        # Create a pedalboard with effects
        board = Pedalboard([
            # Subtle compression to even out dynamics
            Compressor(threshold_db=-20, ratio=2.5, attack_ms=5, release_ms=100),
            # Phaser for a slightly breathier quality
            Phaser(rate_hz=0.7, depth=0.3, feedback=0.2, mix=0.15),
            # High-pass filter to cut low frequencies and emphasize the upper range
            LadderFilter(mode=LadderFilter.Mode.HPF12, cutoff_hz=300, resonance=0.1),
            # Add a subtle reverb for smoothness
            Reverb(room_size=0.12, damping=0.6, wet_level=0.08, dry_level=0.92)
        ])
        # Apply effects
        effected_audio = board(audio, sample_rate)
        return effected_audio

    def convert_to_female(self, audio_path, output_path, intensity='medium'):
        """Convert voice from male to female with adjustable intensity"""
        try:
            # Load audio file
            audio, sample_rate = librosa.load(audio_path, sr=None)
            # Convert to mono if stereo
            if len(audio.shape) > 1:
                audio = librosa.to_mono(audio)
            # Get pitch and formant shift factors based on intensity
            pitch_factor = self.female_pitch_factors[intensity]
            formant_factor = self.female_formant_factors[intensity]
            # Extract pitch contour and formants
            pitch_envelope = self.extract_pitch_envelope(audio, sample_rate)
            formants = self.extract_formants(audio, sample_rate)
            print(f"Original average pitch: {np.mean(pitch_envelope[pitch_envelope > 0])} Hz")
            print(f"Original formants: {formants} Hz")
            # Apply time-frequency domain transformation (WORLD or PSOLA would be better,
            # but using a simpler approach for demonstration)
            # 1. Apply pitch shifting
            audio_pitched = librosa.effects.pitch_shift(
                audio,
                sr=sample_rate,
                n_steps=pitch_factor
            )
            # 2. Apply formant shifting
            # First, split audio into harmonic and percussive components
            harmonic, percussive = librosa.effects.hpss(audio_pitched)
            # Apply formant transformation to the harmonic component
            n_fft = 2048
            hop_length = 512
            # Get spectrogram
            D = librosa.stft(harmonic, n_fft=n_fft, hop_length=hop_length)
            # Compress/stretch frequency axis to shift formants
            freq_bins = D.shape[0]
            # Create a warping matrix for formant shifting
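            # Each output bin i takes its magnitude from input bin int(i / formant_factor),
            # which stretches the spectral envelope upward by formant_factor (raising the
            # formants); the original phases are reused below as an approximation.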
            warp_matrix = np.zeros((freq_bins, freq_bins))
            for i in range(freq_bins):
                target_bin = int(i / formant_factor)
                if target_bin < freq_bins:
                    warp_matrix[i, target_bin] = 1
            # Apply the frequency warping
            D_warped = np.dot(warp_matrix, np.abs(D)) * np.exp(1j * np.angle(D))
            # Convert back to time domain
            harmonic_formant_shifted = librosa.istft(D_warped, hop_length=hop_length)
            # Ensure both components have the same length
            min_len = min(len(harmonic_formant_shifted), len(percussive))
            harmonic_formant_shifted = harmonic_formant_shifted[:min_len]
            percussive = percussive[:min_len]
            # Recombine harmonic and percussive parts
            audio_transformed = harmonic_formant_shifted + 0.8 * percussive
            # Apply audio effects to enhance feminine qualities
            audio_enhanced = self.apply_voice_effects(audio_transformed, sample_rate)
            # Save the result
            sf.write(output_path, audio_enhanced, sample_rate)
            # Extract post-conversion stats for logging
            pitch_envelope_after = self.extract_pitch_envelope(audio_enhanced, sample_rate)
            formants_after = self.extract_formants(audio_enhanced, sample_rate)
            print(f"Converted average pitch: {np.mean(pitch_envelope_after[pitch_envelope_after > 0])} Hz")
            print(f"Converted formants: {formants_after} Hz")
            return output_path
        except Exception as e:
            print(f"Error during conversion: {e}")
            import traceback
            traceback.print_exc()
            return None

# Initialize voice converter
voice_converter = AdvancedVoiceConverter()

# Create Gradio interface
def convert_voice(audio_file, intensity):
    """Function to handle the Gradio interface"""
    # Create a temporary file path for the output
    input_filename = os.path.basename(audio_file)
    output_filename = f"female_{input_filename}"
    output_path = os.path.join(os.path.dirname(audio_file), output_filename)
    # Perform voice conversion
    result = voice_converter.convert_to_female(audio_file, output_path, intensity)
    if result:
        return result
    else:
        return None

# Define the Gradio interface
demo = gr.Interface(
    fn=convert_voice,
    inputs=[
        gr.Audio(type="filepath", label="Upload Voice Audio"),
        gr.Radio(
            ["low", "medium", "high"],
            label="Feminization Intensity",
            value="medium",
            info="Choose how much to feminize the voice"
        )
    ],
    outputs=gr.Audio(label="Converted Female Voice"),
    title="Advanced Voice Gender Conversion",
    description="Upload an audio file to convert it to a female voice using advanced AI techniques.",
    examples=[["sample1.wav", "medium"], ["sample2.wav", "high"]],
    theme=gr.themes.Soft(),
    article="""
## How This Works

This application uses several techniques to make a voice sound more feminine:

1. **Pitch Shifting**: Raises the fundamental frequency toward typical female ranges
2. **Formant Modification**: Adjusts the vocal tract resonances that differentiate male and female voices
3. **Voice Effects**: Applies subtle audio effects to enhance feminine voice qualities
4. **Spectral Transformation**: Uses frequency warping to achieve more natural-sounding results

The 'Feminization Intensity' setting controls how dramatic the transformation is.
"""
)
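
# For reference, the converter can also be used directly without the Gradio UI
# (file names here are placeholders):
#
#     converter = AdvancedVoiceConverter()
#     converter.convert_to_female("input.wav", "female_input.wav", intensity="high")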

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)