import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gradio as gr
import librosa
import soundfile as sf
from scipy.io import wavfile
from scipy import signal
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from pedalboard import Pedalboard, Compressor, LadderFilter, Phaser, Reverb
from pedalboard.io import AudioFile

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class VoiceEncoder(nn.Module):
    """Voice embedding network to extract speaker identity features."""

    def __init__(self, input_dim=512, hidden_dim=256, embedding_dim=128):
        super(VoiceEncoder, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2, embedding_dim)

    def forward(self, x):
        self.lstm.flatten_parameters()
        x, _ = self.lstm(x)
        x = self.linear(x[:, -1, :])
        x = F.normalize(x, p=2, dim=1)
        return x

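# Shape sketch for VoiceEncoder (illustrative only; the network is randomly initialized
# here unless a trained checkpoint is loaded):
#   features = torch.randn(1, 100, 512)    # (batch, frames, feature_dim)
#   embedding = VoiceEncoder()(features)   # -> (1, 128), L2-normalized
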
class AdvancedVoiceConverter:
    def __init__(self):
        # Pretrained wav2vec2 model and processor for feature extraction
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)

        # Speaker-embedding network (randomly initialized unless weights are loaded)
        self.voice_encoder = VoiceEncoder().to(device)

        # Pitch shift amounts in semitones (passed to librosa pitch_shift as n_steps)
        self.female_pitch_factors = {
            'low': 1.5,
            'medium': 2.5,
            'high': 3.5
        }

        # Spectral (formant) scaling factors applied to the harmonic component
        self.female_formant_factors = {
            'low': 1.15,
            'medium': 1.25,
            'high': 1.35
        }

        # Approximate reference statistics for adult female speech (kept for reference)
        self.female_stats = {
            'pitch_mean': 220.0,
            'pitch_std': 30.0,
            'formant1_mean': 850.0,
            'formant2_mean': 2200.0,
            'formant3_mean': 3000.0,
        }

    def extract_features(self, audio, sample_rate):
        """Extract audio features using wav2vec2."""
        # wav2vec2 expects 16 kHz input
        if sample_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000

        inputs = self.processor(audio, sampling_rate=sample_rate, return_tensors="pt").to(device)
        with torch.no_grad():
            # output_hidden_states=True is required; otherwise outputs.hidden_states is None
            outputs = self.model(**inputs, output_hidden_states=True)

        hidden_states = outputs.hidden_states[-1].squeeze().cpu().numpy()
        return hidden_states

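    # Illustrative use (assumes `converter` is an AdvancedVoiceConverter instance and
    # `audio` is a mono float array loaded with librosa):
    #   feats = converter.extract_features(audio, sr)
    # For wav2vec2-base, the last hidden state has 768 features per frame.
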
    def extract_pitch_envelope(self, audio, sample_rate):
        """Extract pitch and envelope information."""
        pitches, magnitudes = librosa.piptrack(y=audio, sr=sample_rate)
        pitch_envelope = []

        # For each frame, keep the pitch of the strongest bin (0 if unvoiced)
        for i in range(pitches.shape[1]):
            index = magnitudes[:, i].argmax()
            pitch = pitches[index, i]
            pitch_envelope.append(pitch if pitch > 0 else 0)

        return np.array(pitch_envelope)

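    # Note: averaging only the voiced frames, e.g. np.mean(env[env > 0]), gives the mean
    # pitch in Hz; convert_to_female uses exactly this to log pitch before and after.
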
    def extract_formants(self, audio, sample_rate, n_formants=3):
        """Extract formant frequencies using Linear Prediction Coefficients."""
        audio_pre = librosa.effects.preemphasis(audio)

        # Rule-of-thumb LPC order: 2 + sample_rate / 1000
        order = 2 + sample_rate // 1000
        lpc = librosa.lpc(audio_pre, order=order)

        # librosa returns LPC coefficients in descending powers, so use np.roots
        # (np.polynomial.polynomial.polyroots expects ascending order)
        roots = np.roots(lpc)

        # Keep one root from each complex-conjugate pair
        roots = roots[np.imag(roots) > 0]

        # Convert root angles to resonance frequencies in Hz
        angles = np.arctan2(np.imag(roots), np.real(roots))
        formants = angles * (sample_rate / (2 * np.pi))

        formants = sorted(formants)[:n_formants]
        return np.array(formants)

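    # Worked example of the order rule above: at 22,050 Hz, order = 2 + 22 = 24, which
    # yields up to 12 complex-conjugate root pairs; the three lowest-frequency resonances
    # are returned as F1-F3.
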
    def apply_voice_effects(self, audio, sample_rate):
        """Apply audio effects to enhance the feminine quality of the voice."""
        board = Pedalboard([
            # Even out dynamics
            Compressor(threshold_db=-20, ratio=2.5, attack_ms=5, release_ms=100),
            # Add subtle movement
            Phaser(rate_hz=0.7, depth=0.3, feedback=0.2, mix=0.15),
            # High-pass to reduce low-frequency (chest) energy
            LadderFilter(mode=LadderFilter.Mode.HPF12, cutoff_hz=300, resonance=0.1),
            # A touch of room ambience
            Reverb(room_size=0.12, damping=0.6, wet_level=0.08, dry_level=0.92)
        ])

        effected_audio = board(audio, sample_rate)
        return effected_audio

    def convert_to_female(self, audio_path, output_path, intensity='medium'):
        """Convert voice from male to female with adjustable intensity."""
        try:
            # Load at the native sample rate and force mono
            audio, sample_rate = librosa.load(audio_path, sr=None)
            if len(audio.shape) > 1:
                audio = librosa.to_mono(audio)

            # Intensity selects the pitch shift (in semitones) and the formant scaling
            pitch_factor = self.female_pitch_factors[intensity]
            formant_factor = self.female_formant_factors[intensity]

            # Analyze the original voice for logging
            pitch_envelope = self.extract_pitch_envelope(audio, sample_rate)
            formants = self.extract_formants(audio, sample_rate)

            print(f"Original average pitch: {np.mean(pitch_envelope[pitch_envelope > 0]):.1f} Hz")
            print(f"Original formants: {formants} Hz")

            # Step 1: raise the fundamental frequency
            audio_pitched = librosa.effects.pitch_shift(
                audio,
                sr=sample_rate,
                n_steps=pitch_factor
            )
            # Step 2: separate harmonic and percussive components so formant warping
            # is applied only to the harmonic (voiced) part
            harmonic, percussive = librosa.effects.hpss(audio_pitched)

            n_fft = 2048
            hop_length = 512

            D = librosa.stft(harmonic, n_fft=n_fft, hop_length=hop_length)
            freq_bins = D.shape[0]

            # Build a mapping that shifts spectral-envelope energy upward: output bin i
            # takes its magnitude from bin int(i / formant_factor), e.g. bin 100 draws
            # from bin 80 when formant_factor is 1.25
            warp_matrix = np.zeros((freq_bins, freq_bins))
            for i in range(freq_bins):
                target_bin = int(i / formant_factor)
                if target_bin < freq_bins:
                    warp_matrix[i, target_bin] = 1

            # Warp the magnitude spectrogram while keeping the original phase
            D_warped = np.dot(warp_matrix, np.abs(D)) * np.exp(1j * np.angle(D))
            harmonic_formant_shifted = librosa.istft(D_warped, hop_length=hop_length)

            # Trim both components to the same length before recombining
            min_len = min(len(harmonic_formant_shifted), len(percussive))
            harmonic_formant_shifted = harmonic_formant_shifted[:min_len]
            percussive = percussive[:min_len]

            audio_transformed = harmonic_formant_shifted + 0.8 * percussive

            # Step 3: polish with the pedalboard effects chain
            audio_enhanced = self.apply_voice_effects(audio_transformed, sample_rate)

            sf.write(output_path, audio_enhanced, sample_rate)

            # Re-analyze to report how the conversion changed pitch and formants
            pitch_envelope_after = self.extract_pitch_envelope(audio_enhanced, sample_rate)
            formants_after = self.extract_formants(audio_enhanced, sample_rate)

            print(f"Converted average pitch: {np.mean(pitch_envelope_after[pitch_envelope_after > 0]):.1f} Hz")
            print(f"Converted formants: {formants_after} Hz")

            return output_path

        except Exception as e:
            print(f"Error during conversion: {e}")
            import traceback
            traceback.print_exc()
            return None

voice_converter = AdvancedVoiceConverter()

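# Direct (non-UI) usage sketch; the file names below are placeholders:
#   voice_converter.convert_to_female("input_male.wav", "output_female.wav", intensity="high")
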
def convert_voice(audio_file, intensity):
    """Handle a single conversion request from the Gradio interface."""
    # Gradio passes None when no file has been uploaded
    if audio_file is None:
        return None

    input_filename = os.path.basename(audio_file)
    output_filename = f"female_{input_filename}"
    output_path = os.path.join(os.path.dirname(audio_file), output_filename)

    result = voice_converter.convert_to_female(audio_file, output_path, intensity)
    return result if result else None

demo = gr.Interface(
    fn=convert_voice,
    inputs=[
        gr.Audio(type="filepath", label="Upload Voice Audio"),
        gr.Radio(
            ["low", "medium", "high"],
            label="Feminization Intensity",
            value="medium",
            info="Choose how much to feminize the voice"
        )
    ],
    outputs=gr.Audio(label="Converted Female Voice"),
    title="Advanced Voice Gender Conversion",
    description="Upload an audio file to convert it to a female voice using advanced AI techniques.",
    # The example entries assume sample1.wav and sample2.wav exist in the working directory
    examples=[["sample1.wav", "medium"], ["sample2.wav", "high"]],
    theme=gr.themes.Soft(),
    article="""
## How This Works

This application uses several techniques to make voices sound more feminine:

1. **Pitch Shifting**: raises the fundamental frequency to typical female ranges
2. **Formant Modification**: adjusts the vocal tract resonances that differentiate male and female voices
3. **Voice Effects**: applies subtle audio effects to enhance feminine voice qualities
4. **Spectral Transformation**: uses frequency warping to achieve more natural-sounding results

The 'Feminization Intensity' setting lets you control how dramatic the transformation should be.
"""
)

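# Rough mapping of the article's four steps onto this module:
#   1. Pitch shifting        -> librosa.effects.pitch_shift in convert_to_female
#   2. Formant modification  -> the warp_matrix frequency warping of the harmonic STFT
#   3. Voice effects         -> apply_voice_effects (pedalboard chain)
#   4. Spectral transform    -> the STFT -> warp -> ISTFT round trip carrying that warping
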
if __name__ == "__main__":
    demo.launch(share=True)