import os
import torch
import numpy as np
import gradio as gr
import librosa
import soundfile as sf
import torch.nn as nn
import torch.nn.functional as F
from scipy.io import wavfile
from scipy import signal
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from pedalboard import Pedalboard, Compressor, LadderFilter, Phaser, Reverb
from pedalboard.io import AudioFile
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class VoiceEncoder(nn.Module):
    """Voice embedding network to extract speaker identity features"""
    def __init__(self, input_dim=512, hidden_dim=256, embedding_dim=128):
        super(VoiceEncoder, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2, embedding_dim)

    def forward(self, x):
        self.lstm.flatten_parameters()
        x, _ = self.lstm(x)
        x = self.linear(x[:, -1, :])    # Take last timestep
        x = F.normalize(x, p=2, dim=1)  # L2 normalization
        return x
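
# Illustrative sketch (not called by the pipeline below): how the placeholder
# VoiceEncoder could be applied to frame-level features to obtain L2-normalized
# speaker embeddings. Note that wav2vec2-base hidden states are 768-dimensional,
# so input_dim would need to match if this were actually wired up.
#
#   encoder = VoiceEncoder(input_dim=768).to(device)
#   features = torch.randn(1, 200, 768, device=device)  # (batch, frames, feature_dim)
#   with torch.no_grad():
#       embedding = encoder(features)                    # (1, 128), unit-norm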

class AdvancedVoiceConverter:
    def __init__(self):
        # Load wav2vec model for audio feature extraction
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)
        # Initialize voice encoder (placeholder - not fully implemented)
        self.voice_encoder = VoiceEncoder().to(device)
        # Voice modification parameters (pitch shift amounts in semitones)
        self.female_pitch_factors = {
            'low': 1.5,     # Slight pitch increase
            'medium': 2.5,  # Moderate pitch increase
            'high': 3.5     # Significant pitch increase
        }
        # Formant (spectral envelope) scaling factors
        self.female_formant_factors = {
            'low': 1.15,
            'medium': 1.25,
            'high': 1.35
        }
        # Target female voice statistics (these would normally be learned from data)
        self.female_stats = {
            'pitch_mean': 220.0,      # Hz (typical female fundamental frequency)
            'pitch_std': 30.0,
            'formant1_mean': 850.0,   # Hz (typical female first formant)
            'formant2_mean': 2200.0,  # Hz (typical female second formant)
            'formant3_mean': 3000.0,  # Hz (typical female third formant)
        }
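        # For rough comparison: typical adult male values are approximately
        # pitch ~120 Hz, formant1 ~600 Hz, formant2 ~1700 Hz, formant3 ~2600 Hz
        # (ballpark figures only; not used directly by the conversion below).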

    def extract_features(self, audio, sample_rate):
        """Extract audio features using wav2vec2"""
        # Resample if needed (wav2vec2 expects 16 kHz input)
        if sample_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000
        # Extract features
        inputs = self.processor(audio, sampling_rate=sample_rate, return_tensors="pt").to(device)
        with torch.no_grad():
            # output_hidden_states=True is required, otherwise outputs.hidden_states is None
            outputs = self.model(**inputs, output_hidden_states=True)
        # Get hidden states (features) from the last transformer layer
        hidden_states = outputs.hidden_states[-1].squeeze().cpu().numpy()
        return hidden_states

    def extract_pitch_envelope(self, audio, sample_rate):
        """Extract pitch and envelope information"""
        # Extract pitch using librosa
        pitches, magnitudes = librosa.piptrack(y=audio, sr=sample_rate)
        pitch_envelope = []
        # Find the most dominant pitch at each frame
        for i in range(pitches.shape[1]):
            index = magnitudes[:, i].argmax()
            pitch = pitches[index, i]
            pitch_envelope.append(pitch if pitch > 0 else 0)
        return np.array(pitch_envelope)

    def extract_formants(self, audio, sample_rate, n_formants=3):
        """Extract formant frequencies using Linear Prediction Coefficients"""
        # Pre-emphasis to amplify higher frequencies
        audio_pre = librosa.effects.preemphasis(audio)
        # Get LPC coefficients (rule of thumb: order = 2 + sample rate in kHz, e.g. 18 at 16 kHz)
        order = 2 + sample_rate // 1000
        lpc = librosa.lpc(audio_pre, order=order)
        # Get roots of the LPC polynomial (coefficients are ordered highest power first)
        roots = np.roots(lpc)
        # Keep only roots with positive imaginary part (one root per conjugate pair)
        roots = roots[np.imag(roots) > 0]
        # Convert root angles to frequencies in Hz
        angles = np.arctan2(np.imag(roots), np.real(roots))
        formants = angles * (sample_rate / (2 * np.pi))
        # Sort and return the first n_formants
        formants = sorted(formants)[:n_formants]
        return np.array(formants)

    def apply_voice_effects(self, audio, sample_rate):
        """Apply audio effects to enhance the feminine quality of the voice"""
        # Create a pedalboard with effects
        board = Pedalboard([
            # Subtle compression to even out dynamics
            Compressor(threshold_db=-20, ratio=2.5, attack_ms=5, release_ms=100),
            # Phaser for a slightly breathier quality
            Phaser(rate_hz=0.7, depth=0.3, feedback=0.2, mix=0.15),
            # High-pass filter to reduce low-frequency (chest) resonance
            LadderFilter(mode=LadderFilter.Mode.HPF12, cutoff_hz=300, resonance=0.1),
            # Add a subtle reverb for smoothness
            Reverb(room_size=0.12, damping=0.6, wet_level=0.08, dry_level=0.92)
        ])
        # Apply the effects chain in the order listed above
        effected_audio = board(audio, sample_rate)
        return effected_audio
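
    # Illustrative only: the same chain could be applied with pedalboard's own file I/O
    # (hypothetical file name), e.g. via the AudioFile import above:
    #   with AudioFile("voice.wav") as f:
    #       samples = f.read(f.frames)               # (channels, frames), float32
    #       processed = board(samples, f.samplerate)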

    def convert_to_female(self, audio_path, output_path, intensity='medium'):
        """Convert voice from male to female with adjustable intensity"""
        try:
            # Load audio file
            audio, sample_rate = librosa.load(audio_path, sr=None)
            # Convert to mono if stereo
            if len(audio.shape) > 1:
                audio = librosa.to_mono(audio)
            # Get pitch and formant shift factors based on intensity
            pitch_factor = self.female_pitch_factors[intensity]
            formant_factor = self.female_formant_factors[intensity]
            # Extract pitch contour and formants for logging
            pitch_envelope = self.extract_pitch_envelope(audio, sample_rate)
            formants = self.extract_formants(audio, sample_rate)
            print(f"Original average pitch: {np.mean(pitch_envelope[pitch_envelope > 0])} Hz")
            print(f"Original formants: {formants} Hz")
            # Apply a time-frequency domain transformation (WORLD or PSOLA would be better,
            # but this uses a simpler approach for demonstration)
            # 1. Pitch shifting (pitch_factor is in semitones)
            audio_pitched = librosa.effects.pitch_shift(
                audio,
                sr=sample_rate,
                n_steps=pitch_factor
            )
            # 2. Formant shifting via spectral frequency warping
            # First, split audio into harmonic and percussive components
            harmonic, percussive = librosa.effects.hpss(audio_pitched)
            # Apply formant transformation to the harmonic component only
            n_fft = 2048
            hop_length = 512
            # Get spectrogram
            D = librosa.stft(harmonic, n_fft=n_fft, hop_length=hop_length)
            freq_bins = D.shape[0]
            # Create a warping matrix that maps energy from bin i/formant_factor up to bin i,
            # stretching the spectral envelope and thus raising the formants
            warp_matrix = np.zeros((freq_bins, freq_bins))
            for i in range(freq_bins):
                target_bin = int(i / formant_factor)
                if target_bin < freq_bins:
                    warp_matrix[i, target_bin] = 1
            # Apply the frequency warping to the magnitude while keeping the original phase
            D_warped = np.dot(warp_matrix, np.abs(D)) * np.exp(1j * np.angle(D))
            # Convert back to the time domain
            harmonic_formant_shifted = librosa.istft(D_warped, hop_length=hop_length)
            # Ensure both components have the same length
            min_len = min(len(harmonic_formant_shifted), len(percussive))
            harmonic_formant_shifted = harmonic_formant_shifted[:min_len]
            percussive = percussive[:min_len]
            # Recombine harmonic and (slightly attenuated) percussive parts
            audio_transformed = harmonic_formant_shifted + 0.8 * percussive
            # Apply audio effects to enhance feminine qualities
            audio_enhanced = self.apply_voice_effects(audio_transformed, sample_rate)
            # Save the result
            sf.write(output_path, audio_enhanced, sample_rate)
            # Extract post-conversion stats for logging
            pitch_envelope_after = self.extract_pitch_envelope(audio_enhanced, sample_rate)
            formants_after = self.extract_formants(audio_enhanced, sample_rate)
            print(f"Converted average pitch: {np.mean(pitch_envelope_after[pitch_envelope_after > 0])} Hz")
            print(f"Converted formants: {formants_after} Hz")
            return output_path
        except Exception as e:
            print(f"Error during conversion: {e}")
            import traceback
            traceback.print_exc()
            return None
# Initialize voice converter
voice_converter = AdvancedVoiceConverter()
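
# Direct (non-Gradio) usage sketch; "input.wav" / "output_female.wav" are placeholder paths:
#   voice_converter.convert_to_female("input.wav", "output_female.wav", intensity="medium")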
# Create Gradio interface
def convert_voice(audio_file, intensity):
    """Function to handle the Gradio interface"""
    # Build an output path next to the uploaded (temporary) input file
    input_filename = os.path.basename(audio_file)
    output_filename = f"female_{input_filename}"
    output_path = os.path.join(os.path.dirname(audio_file), output_filename)
    # Perform voice conversion
    result = voice_converter.convert_to_female(audio_file, output_path, intensity)
    if result:
        return result
    else:
        return None
# Define the Gradio interface
demo = gr.Interface(
    fn=convert_voice,
    inputs=[
        gr.Audio(type="filepath", label="Upload Voice Audio"),
        gr.Radio(
            ["low", "medium", "high"],
            label="Feminization Intensity",
            value="medium",
            info="Choose how much to feminize the voice"
        )
    ],
    outputs=gr.Audio(label="Converted Female Voice"),
    title="Advanced Voice Gender Conversion",
    description="Upload an audio file to convert it to a female voice using advanced AI techniques.",
    examples=[["sample1.wav", "medium"], ["sample2.wav", "high"]],
    theme=gr.themes.Soft(),
article="""
## How This Works
This application uses several advanced techniques to convert voices to sound more feminine:
1. **Pitch Shifting**: Raises the fundamental frequency to typical female ranges
2. **Formant Modification**: Adjusts the vocal tract resonances that differentiate male and female voices
3. **Voice Effects**: Applies subtle audio effects to enhance feminine voice qualities
4. **Spectral Transformation**: Uses frequency warping to achieve more natural-sounding results
The 'Feminization Intensity' lets you control how dramatic the transformation should be.
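
At its core, the conversion is a pitch shift followed by a spectral frequency warp. A simplified, illustrative sketch of that step ("voice.wav" is a placeholder path; 2.5 semitones and 1.25 correspond to the 'medium' setting):

```python
import numpy as np
import librosa

y, sr = librosa.load("voice.wav", sr=None)
y = librosa.effects.pitch_shift(y, sr=sr, n_steps=2.5)  # raise pitch by ~2.5 semitones
D = librosa.stft(y, n_fft=2048, hop_length=512)
bins = D.shape[0]
warp = np.zeros((bins, bins))
for i in range(bins):
    j = int(i / 1.25)        # 1.25 = formant shift factor
    if j < bins:
        warp[i, j] = 1       # output bin i takes energy from lower bin j
D = (warp @ np.abs(D)) * np.exp(1j * np.angle(D))       # warp magnitude, keep phase
y_out = librosa.istft(D, hop_length=512)
```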
"""
)
# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)