import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gradio as gr
import librosa
import soundfile as sf
from scipy.io import wavfile
from scipy import signal
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from pedalboard import Pedalboard, Compressor, LadderFilter, Phaser, Reverb
from pedalboard.io import AudioFile

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class VoiceEncoder(nn.Module):
    """Voice embedding network to extract speaker identity features."""

    def __init__(self, input_dim=512, hidden_dim=256, embedding_dim=128):
        super(VoiceEncoder, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2, embedding_dim)

    def forward(self, x):
        self.lstm.flatten_parameters()
        x, _ = self.lstm(x)
        x = self.linear(x[:, -1, :])
        x = F.normalize(x, p=2, dim=1)
        return x

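# Shape sketch for VoiceEncoder (illustrative only; the network is randomly initialized
# here unless a trained checkpoint is loaded):
#   features = torch.randn(1, 100, 512)    # (batch, frames, feature_dim)
#   embedding = VoiceEncoder()(features)   # -> (1, 128), L2-normalized
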
class AdvancedVoiceConverter:
    def __init__(self):
        # Pretrained wav2vec2 model and processor for feature extraction
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)

        # Speaker-embedding network (randomly initialized unless weights are loaded)
        self.voice_encoder = VoiceEncoder().to(device)

        # Pitch shift amounts in semitones (passed to librosa pitch_shift as n_steps)
        self.female_pitch_factors = {
            'low': 1.5,
            'medium': 2.5,
            'high': 3.5
        }

        # Spectral (formant) scaling factors applied to the harmonic component
        self.female_formant_factors = {
            'low': 1.15,
            'medium': 1.25,
            'high': 1.35
        }

        # Approximate reference statistics for adult female speech (kept for reference)
        self.female_stats = {
            'pitch_mean': 220.0,
            'pitch_std': 30.0,
            'formant1_mean': 850.0,
            'formant2_mean': 2200.0,
            'formant3_mean': 3000.0,
        }

    def extract_features(self, audio, sample_rate):
        """Extract audio features using wav2vec2."""
        # wav2vec2 expects 16 kHz input
        if sample_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000

        inputs = self.processor(audio, sampling_rate=sample_rate, return_tensors="pt").to(device)
        with torch.no_grad():
            # output_hidden_states=True is required; otherwise outputs.hidden_states is None
            outputs = self.model(**inputs, output_hidden_states=True)

        hidden_states = outputs.hidden_states[-1].squeeze().cpu().numpy()
        return hidden_states

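    # Illustrative use (assumes `converter` is an AdvancedVoiceConverter instance and
    # `audio` is a mono float array loaded with librosa):
    #   feats = converter.extract_features(audio, sr)
    # For wav2vec2-base, the last hidden state has 768 features per frame.
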
    def extract_pitch_envelope(self, audio, sample_rate):
        """Extract pitch and envelope information."""
        pitches, magnitudes = librosa.piptrack(y=audio, sr=sample_rate)
        pitch_envelope = []

        # For each frame, keep the pitch of the strongest bin (0 if unvoiced)
        for i in range(pitches.shape[1]):
            index = magnitudes[:, i].argmax()
            pitch = pitches[index, i]
            pitch_envelope.append(pitch if pitch > 0 else 0)

        return np.array(pitch_envelope)

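    # Note: averaging only the voiced frames, e.g. np.mean(env[env > 0]), gives the mean
    # pitch in Hz; convert_to_female uses exactly this to log pitch before and after.
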
    def extract_formants(self, audio, sample_rate, n_formants=3):
        """Extract formant frequencies using Linear Prediction Coefficients."""
        audio_pre = librosa.effects.preemphasis(audio)

        # Rule-of-thumb LPC order: 2 + sample_rate / 1000
        order = 2 + sample_rate // 1000
        lpc = librosa.lpc(audio_pre, order=order)

        # librosa returns LPC coefficients in descending powers, so use np.roots
        # (np.polynomial.polynomial.polyroots expects ascending order)
        roots = np.roots(lpc)

        # Keep one root from each complex-conjugate pair
        roots = roots[np.imag(roots) > 0]

        # Convert root angles to resonance frequencies in Hz
        angles = np.arctan2(np.imag(roots), np.real(roots))
        formants = angles * (sample_rate / (2 * np.pi))

        formants = sorted(formants)[:n_formants]
        return np.array(formants)

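    # Worked example of the order rule above: at 22,050 Hz, order = 2 + 22 = 24, which
    # yields up to 12 complex-conjugate root pairs; the three lowest-frequency resonances
    # are returned as F1-F3.
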
    def apply_voice_effects(self, audio, sample_rate):
        """Apply audio effects to enhance the feminine quality of the voice."""
        board = Pedalboard([
            # Even out dynamics
            Compressor(threshold_db=-20, ratio=2.5, attack_ms=5, release_ms=100),
            # Add subtle movement
            Phaser(rate_hz=0.7, depth=0.3, feedback=0.2, mix=0.15),
            # High-pass to reduce low-frequency (chest) energy
            LadderFilter(mode=LadderFilter.Mode.HPF12, cutoff_hz=300, resonance=0.1),
            # A touch of room ambience
            Reverb(room_size=0.12, damping=0.6, wet_level=0.08, dry_level=0.92)
        ])

        effected_audio = board(audio, sample_rate)
        return effected_audio

    def convert_to_female(self, audio_path, output_path, intensity='medium'):
        """Convert voice from male to female with adjustable intensity."""
        try:
            # Load at the native sample rate and force mono
            audio, sample_rate = librosa.load(audio_path, sr=None)
            if len(audio.shape) > 1:
                audio = librosa.to_mono(audio)

            # Intensity selects the pitch shift (in semitones) and the formant scaling
            pitch_factor = self.female_pitch_factors[intensity]
            formant_factor = self.female_formant_factors[intensity]

            # Analyze the original voice for logging
            pitch_envelope = self.extract_pitch_envelope(audio, sample_rate)
            formants = self.extract_formants(audio, sample_rate)

            print(f"Original average pitch: {np.mean(pitch_envelope[pitch_envelope > 0]):.1f} Hz")
            print(f"Original formants: {formants} Hz")

            # Step 1: raise the fundamental frequency
            audio_pitched = librosa.effects.pitch_shift(
                audio,
                sr=sample_rate,
                n_steps=pitch_factor
            )
            # Step 2: separate harmonic and percussive components so formant warping
            # is applied only to the harmonic (voiced) part
            harmonic, percussive = librosa.effects.hpss(audio_pitched)

            n_fft = 2048
            hop_length = 512

            D = librosa.stft(harmonic, n_fft=n_fft, hop_length=hop_length)
            freq_bins = D.shape[0]

            # Build a mapping that shifts spectral-envelope energy upward: output bin i
            # takes its magnitude from bin int(i / formant_factor), e.g. bin 100 draws
            # from bin 80 when formant_factor is 1.25
            warp_matrix = np.zeros((freq_bins, freq_bins))
            for i in range(freq_bins):
                target_bin = int(i / formant_factor)
                if target_bin < freq_bins:
                    warp_matrix[i, target_bin] = 1

            # Warp the magnitude spectrogram while keeping the original phase
            D_warped = np.dot(warp_matrix, np.abs(D)) * np.exp(1j * np.angle(D))
            harmonic_formant_shifted = librosa.istft(D_warped, hop_length=hop_length)

            # Trim both components to the same length before recombining
            min_len = min(len(harmonic_formant_shifted), len(percussive))
            harmonic_formant_shifted = harmonic_formant_shifted[:min_len]
            percussive = percussive[:min_len]

            audio_transformed = harmonic_formant_shifted + 0.8 * percussive

            # Step 3: polish with the pedalboard effects chain
            audio_enhanced = self.apply_voice_effects(audio_transformed, sample_rate)

            sf.write(output_path, audio_enhanced, sample_rate)

            # Re-analyze to report how the conversion changed pitch and formants
            pitch_envelope_after = self.extract_pitch_envelope(audio_enhanced, sample_rate)
            formants_after = self.extract_formants(audio_enhanced, sample_rate)

            print(f"Converted average pitch: {np.mean(pitch_envelope_after[pitch_envelope_after > 0]):.1f} Hz")
            print(f"Converted formants: {formants_after} Hz")

            return output_path

        except Exception as e:
            print(f"Error during conversion: {e}")
            import traceback
            traceback.print_exc()
            return None

voice_converter = AdvancedVoiceConverter()

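# Direct (non-UI) usage sketch; the file names below are placeholders:
#   voice_converter.convert_to_female("input_male.wav", "output_female.wav", intensity="high")
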
def convert_voice(audio_file, intensity):
    """Handle a single conversion request from the Gradio interface."""
    # Gradio passes None when no file has been uploaded
    if audio_file is None:
        return None

    input_filename = os.path.basename(audio_file)
    output_filename = f"female_{input_filename}"
    output_path = os.path.join(os.path.dirname(audio_file), output_filename)

    result = voice_converter.convert_to_female(audio_file, output_path, intensity)
    return result if result else None

demo = gr.Interface(
    fn=convert_voice,
    inputs=[
        gr.Audio(type="filepath", label="Upload Voice Audio"),
        gr.Radio(
            ["low", "medium", "high"],
            label="Feminization Intensity",
            value="medium",
            info="Choose how much to feminize the voice"
        )
    ],
    outputs=gr.Audio(label="Converted Female Voice"),
    title="Advanced Voice Gender Conversion",
    description="Upload an audio file to convert it to a female voice using advanced AI techniques.",
    # The example entries assume sample1.wav and sample2.wav exist in the working directory
    examples=[["sample1.wav", "medium"], ["sample2.wav", "high"]],
    theme=gr.themes.Soft(),
    article="""
## How This Works

This application uses several techniques to make voices sound more feminine:

1. **Pitch Shifting**: raises the fundamental frequency to typical female ranges
2. **Formant Modification**: adjusts the vocal tract resonances that differentiate male and female voices
3. **Voice Effects**: applies subtle audio effects to enhance feminine voice qualities
4. **Spectral Transformation**: uses frequency warping to achieve more natural-sounding results

The 'Feminization Intensity' setting lets you control how dramatic the transformation should be.
"""
)

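# Rough mapping of the article's four steps onto this module:
#   1. Pitch shifting        -> librosa.effects.pitch_shift in convert_to_female
#   2. Formant modification  -> the warp_matrix frequency warping of the harmonic STFT
#   3. Voice effects         -> apply_voice_effects (pedalboard chain)
#   4. Spectral transform    -> the STFT -> warp -> ISTFT round trip carrying that warping
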
if __name__ == "__main__":
    demo.launch(share=True)