"""Male-to-female voice conversion demo.

Applies a pitch shift and a resampling-based formant shift to uploaded
audio and serves the result through a Gradio interface. A wav2vec2 model
is also loaded for optional feature extraction.
"""

import os

import torch
import gradio as gr
import librosa
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


class VoiceConverter:
    def __init__(self):
        # wav2vec2 is loaded for optional feature extraction (see
        # extract_features); the conversion itself is classic DSP.
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)

        # Raise the pitch by two semitones and scale the formants up by 1.2x.
        self.female_pitch_shift = 2.0
        self.female_formant_shift = 1.2

    def extract_features(self, audio, sample_rate):
        """Extract frame-level audio features using wav2vec2."""
        # wav2vec2 expects 16 kHz mono input.
        if sample_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000

        inputs = self.processor(audio, sampling_rate=sample_rate, return_tensors="pt").to(device)
        with torch.no_grad():
            # Hidden states are only returned when explicitly requested.
            outputs = self.model(**inputs, output_hidden_states=True)

        hidden_states = outputs.hidden_states[-1].squeeze().cpu().numpy()
        return hidden_states
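
    # Note: extract_features is not used by convert_to_female below; it is
    # kept for callers who want frame-level embeddings, e.g.
    # voice_converter.extract_features(audio, 16000) -> (frames, 768) for the base model.
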
    def convert_to_female(self, audio_path, output_path):
        """Convert a male voice recording towards a female-sounding one."""
        try:
            audio, sample_rate = librosa.load(audio_path, sr=None)

            # Work on mono audio.
            if len(audio.shape) > 1:
                audio = librosa.to_mono(audio)

            # Step 1: raise the pitch; duration is preserved.
            audio_female = librosa.effects.pitch_shift(
                audio,
                sr=sample_rate,
                n_steps=self.female_pitch_shift
            )

            # Step 2: approximate a formant shift. Time-stretch the audio
            # longer by the formant factor (pitch preserved), then resample
            # back to the original length; writing the result at the original
            # sample rate scales all frequencies, formants included, up by
            # the factor.
            y_stretched = librosa.effects.time_stretch(
                audio_female, rate=1.0 / self.female_formant_shift
            )
            audio_female_formant = librosa.resample(
                y_stretched,
                orig_sr=sample_rate,
                target_sr=int(sample_rate / self.female_formant_shift)
            )

            # Guard against rounding drift in the stretch/resample round trip.
            min_len = min(len(audio), len(audio_female_formant))
            audio_female_formant = audio_female_formant[:min_len]

            sf.write(output_path, audio_female_formant, sample_rate)
            return output_path

        except Exception as e:
            print(f"Error during conversion: {e}")
            return None


voice_converter = VoiceConverter()
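
# Example of using the converter directly, without the UI (paths are
# hypothetical):
#   voice_converter.convert_to_female("male_sample.wav", "female_sample.wav")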

def convert_voice(audio_file):
    """Handle a single request from the Gradio interface."""
    # Gradio passes None when no file was provided.
    if audio_file is None:
        return None

    input_filename = os.path.basename(audio_file)
    output_filename = f"female_{input_filename}"
    output_path = os.path.join(os.path.dirname(audio_file), output_filename)

    # Returns the output path on success, or None on failure.
    result = voice_converter.convert_to_female(audio_file, output_path)
    return result

demo = gr.Interface(
    fn=convert_voice,
    inputs=gr.Audio(type="filepath", label="Upload Male Voice Audio"),
    outputs=gr.Audio(label="Converted Female Voice"),
    title="Voice Gender Conversion (Male to Female)",
    description="Upload an audio file with a male voice to convert it to a female voice.",
    examples=[["sample1.wav"], ["sample2.wav"]],
    theme=gr.themes.Soft()
)
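
# The example clips sample1.wav / sample2.wav are assumed to exist next to
# this script; remove the `examples` argument if they are not available.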

if __name__ == "__main__":
    demo.launch(share=True)