import os

import gradio as gr
import librosa
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


class VoiceConverter:
    def __init__(self):
        # Load wav2vec2 for audio feature extraction (not yet wired into the
        # conversion path below; kept for downstream experimentation)
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)

        # Parameters for voice conversion
        self.female_pitch_shift = 2.0    # semitones to shift pitch up
        self.female_formant_shift = 1.2  # spectral scaling factor

    def extract_features(self, audio, sample_rate):
        """Extract audio features using wav2vec2."""
        # wav2vec2 expects 16 kHz input; resample if needed
        if sample_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000

        inputs = self.processor(audio, sampling_rate=sample_rate, return_tensors="pt").to(device)
        with torch.no_grad():
            # output_hidden_states=True is required; without it,
            # outputs.hidden_states is None
            outputs = self.model(**inputs, output_hidden_states=True)

        # Use the last hidden layer as the feature representation
        hidden_states = outputs.hidden_states[-1].squeeze().cpu().numpy()
        return hidden_states

    def convert_to_female(self, audio_path, output_path):
        """Convert a male voice recording toward a female register."""
        try:
            # Load audio (librosa downmixes to mono by default)
            audio, sample_rate = librosa.load(audio_path, sr=None, mono=True)

            # Shift pitch up by self.female_pitch_shift semitones
            audio_female = librosa.effects.pitch_shift(
                audio, sr=sample_rate, n_steps=self.female_pitch_shift
            )

            # Crude "formant" shift: resample to a lower rate so that playback
            # at the original rate scales the whole spectrum up, then
            # time-stretch to restore the duration. Note this scales pitch as
            # well as formants, so tune female_pitch_shift with that in mind.
            # (Resampling up and immediately back down, as in an earlier
            # version, cancels out and leaves the spectrum unchanged.)
            shift = self.female_formant_shift
            audio_resampled = librosa.resample(
                audio_female, orig_sr=sample_rate, target_sr=int(sample_rate / shift)
            )
            audio_female_formant = librosa.effects.time_stretch(
                audio_resampled, rate=1.0 / shift
            )

            # Match the length with the original
            min_len = min(len(audio), len(audio_female_formant))
            audio_female_formant = audio_female_formant[:min_len]

            # Save the result
            sf.write(output_path, audio_female_formant, sample_rate)
            return output_path
        except Exception as e:
            print(f"Error during conversion: {e}")
            return None


# Initialize voice converter
voice_converter = VoiceConverter()


def convert_voice(audio_file):
    """Gradio handler: write the converted file next to the input file."""
    input_filename = os.path.basename(audio_file)
    output_filename = f"female_{input_filename}"
    output_path = os.path.join(os.path.dirname(audio_file), output_filename)

    # Returns the output path on success, None on failure
    return voice_converter.convert_to_female(audio_file, output_path)


# Only list example files that actually exist, so Gradio does not fail
# on missing paths
example_files = [[f] for f in ("sample1.wav", "sample2.wav") if os.path.exists(f)]

# Define the Gradio interface
demo = gr.Interface(
    fn=convert_voice,
    inputs=gr.Audio(type="filepath", label="Upload Male Voice Audio"),
    outputs=gr.Audio(label="Converted Female Voice"),
    title="Voice Gender Conversion (Male to Female)",
    description="Upload an audio file with a male voice to convert it to a female voice using AI.",
    examples=example_files or None,
    theme=gr.themes.Soft(),
)

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)
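

# ---------------------------------------------------------------------------
# Optional sketch: a closer approximation to a true formant shift.
#
# The resampling trick in convert_to_female scales the whole spectrum, pitch
# included. A rough alternative (a sketch, not part of the original pipeline)
# warps the magnitude spectrogram along the frequency axis and resynthesizes
# with the original phase, which moves spectral-envelope features without a
# second pitch change. The helper name shift_formants_stft is hypothetical;
# quality is limited compared to a real vocoder such as WORLD/pyworld. As
# written here it sits below demo.launch() above, so move it above the
# __main__ block (or import this file as a module) to actually use it.
# ---------------------------------------------------------------------------
import numpy as np


def shift_formants_stft(audio, factor, n_fft=2048):
    """Scale spectral content up by `factor` via frequency-axis warping."""
    stft = librosa.stft(audio, n_fft=n_fft)
    magnitude, phase = np.abs(stft), np.angle(stft)
    bins = np.arange(magnitude.shape[0])
    # Read each frame's magnitude at bin/factor, so a feature originally at
    # bin k/factor lands at bin k, moving the envelope up by `factor`
    warped = np.stack(
        [np.interp(bins / factor, bins, frame) for frame in magnitude.T]
    ).T
    # Reuse the original phase; some artifacts are expected in this crude sketch
    return librosa.istft(warped * np.exp(1j * phase), length=len(audio))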
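

# ---------------------------------------------------------------------------
# Optional sanity check (sketch): estimate the median f0 of a file before and
# after conversion with librosa.yin to confirm the pitch moved up by roughly
# the expected ratio (2 semitones plus the 1.2x spectral scale is about a
# factor of 1.35). median_f0 is a hypothetical helper, not part of the app,
# and relies on the numpy import above.
# ---------------------------------------------------------------------------
def median_f0(path):
    """Median fundamental frequency of an audio file, in Hz."""
    y, sr = librosa.load(path, sr=None, mono=True)
    # 65-400 Hz covers typical speech; widen the range for other material
    f0 = librosa.yin(y, fmin=65, fmax=400, sr=sr)
    return float(np.median(f0))


# Example usage:
#   before = median_f0("input.wav")
#   after = median_f0("female_input.wav")
#   print(f"f0 ratio: {after / before:.2f}")  # expect roughly 1.35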