"""Male-to-female voice conversion demo.

Applies a pitch shift and a resampling-based formant shift to uploaded
audio and serves the result through a Gradio interface. A wav2vec2 model
is also loaded for optional feature extraction.
"""

import os

import torch
import gradio as gr
import librosa
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


class VoiceConverter:
    def __init__(self):
        # wav2vec2 is loaded for optional feature extraction (see
        # extract_features); the conversion itself is classic DSP.
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)

        # Raise the pitch by two semitones and scale the formants up by 1.2x.
        self.female_pitch_shift = 2.0
        self.female_formant_shift = 1.2

    def extract_features(self, audio, sample_rate):
        """Extract frame-level audio features using wav2vec2."""
        # wav2vec2 expects 16 kHz mono input.
        if sample_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000

        inputs = self.processor(audio, sampling_rate=sample_rate, return_tensors="pt").to(device)
        with torch.no_grad():
            # Hidden states are only returned when explicitly requested.
            outputs = self.model(**inputs, output_hidden_states=True)

        hidden_states = outputs.hidden_states[-1].squeeze().cpu().numpy()
        return hidden_states
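
    # Note: extract_features is not used by convert_to_female below; it is
    # kept for callers who want frame-level embeddings, e.g.
    # voice_converter.extract_features(audio, 16000) -> (frames, 768) for the base model.
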
    def convert_to_female(self, audio_path, output_path):
        """Convert a male voice recording towards a female-sounding one."""
        try:
            audio, sample_rate = librosa.load(audio_path, sr=None)

            # Work on mono audio.
            if len(audio.shape) > 1:
                audio = librosa.to_mono(audio)

            # Step 1: raise the pitch; duration is preserved.
            audio_female = librosa.effects.pitch_shift(
                audio,
                sr=sample_rate,
                n_steps=self.female_pitch_shift
            )

            # Step 2: approximate a formant shift. Time-stretch the audio
            # longer by the formant factor (pitch preserved), then resample
            # back to the original length; writing the result at the original
            # sample rate scales all frequencies, formants included, up by
            # the factor.
            y_stretched = librosa.effects.time_stretch(
                audio_female, rate=1.0 / self.female_formant_shift
            )
            audio_female_formant = librosa.resample(
                y_stretched,
                orig_sr=sample_rate,
                target_sr=int(sample_rate / self.female_formant_shift)
            )

            # Guard against rounding drift in the stretch/resample round trip.
            min_len = min(len(audio), len(audio_female_formant))
            audio_female_formant = audio_female_formant[:min_len]

            sf.write(output_path, audio_female_formant, sample_rate)
            return output_path

        except Exception as e:
            print(f"Error during conversion: {e}")
            return None


voice_converter = VoiceConverter()
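
# Example of using the converter directly, without the UI (paths are
# hypothetical):
#   voice_converter.convert_to_female("male_sample.wav", "female_sample.wav")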

def convert_voice(audio_file):
    """Handle a single request from the Gradio interface."""
    # Gradio passes None when no file was provided.
    if audio_file is None:
        return None

    input_filename = os.path.basename(audio_file)
    output_filename = f"female_{input_filename}"
    output_path = os.path.join(os.path.dirname(audio_file), output_filename)

    # Returns the output path on success, or None on failure.
    result = voice_converter.convert_to_female(audio_file, output_path)
    return result

demo = gr.Interface(
    fn=convert_voice,
    inputs=gr.Audio(type="filepath", label="Upload Male Voice Audio"),
    outputs=gr.Audio(label="Converted Female Voice"),
    title="Voice Gender Conversion (Male to Female)",
    description="Upload an audio file with a male voice to convert it to a female voice.",
    examples=[["sample1.wav"], ["sample2.wav"]],
    theme=gr.themes.Soft()
)
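
# The example clips sample1.wav / sample2.wav are assumed to exist next to
# this script; remove the `examples` argument if they are not available.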

if __name__ == "__main__":
    demo.launch(share=True)