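"""Male-to-female voice conversion demo.

Pitch-shifts and formant-shifts an uploaded male voice recording toward a
female-sounding one with librosa, and serves the conversion through a
Gradio interface. A wav2vec2 model is loaded for optional feature
extraction.
"""
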
import os

import torch
import gradio as gr
import librosa
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


class VoiceConverter:
    def __init__(self):
        # Load wav2vec2 model for audio feature extraction
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)
        # Parameters for voice conversion
        self.female_pitch_shift = 2.0    # Pitch shift in semitones (up, toward a female register)
        self.female_formant_shift = 1.2  # Spectral-envelope scaling factor for female formants
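
    # Note: librosa's pitch_shift takes n_steps in semitones, so a 2.0-step
    # shift raises f0 by a factor of 2 ** (2 / 12) ≈ 1.12. The resampling
    # step in convert_to_female adds a further factor of 1.2, so the overall
    # pitch rise is larger than n_steps alone suggests.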
    def extract_features(self, audio, sample_rate):
        """Extract audio features using wav2vec2 (not used by the conversion
        path below; kept for experimentation)."""
        # Resample if needed: wav2vec2-base-960h expects 16 kHz input
        if sample_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000
        # Extract features
        inputs = self.processor(audio, sampling_rate=sample_rate, return_tensors="pt").to(device)
        with torch.no_grad():
            # Hidden states are only populated when explicitly requested
            outputs = self.model(**inputs, output_hidden_states=True)
        # Get hidden states (features) from the last transformer layer
        hidden_states = outputs.hidden_states[-1].squeeze().cpu().numpy()
        return hidden_states
    def convert_to_female(self, audio_path, output_path):
        """Convert voice from male to female."""
        try:
            # Load audio file at its native sample rate
            audio, sample_rate = librosa.load(audio_path, sr=None)
            # Convert to mono if stereo
            if audio.ndim > 1:
                audio = librosa.to_mono(audio)
            # Apply pitch shifting for female voice
            audio_female = librosa.effects.pitch_shift(
                audio,
                sr=sample_rate,
                n_steps=self.female_pitch_shift
            )
            # Apply a crude formant shift: resampling to a lower rate and then
            # playing back at the original rate scales the spectral envelope
            # (and pitch) up by the shift factor; time-stretching afterwards
            # restores the original duration.
            resampled = librosa.resample(
                audio_female,
                orig_sr=sample_rate,
                target_sr=int(sample_rate / self.female_formant_shift)
            )
            audio_female_formant = librosa.effects.time_stretch(
                resampled,
                rate=1.0 / self.female_formant_shift
            )
            # Match the length with the original
            min_len = min(len(audio), len(audio_female_formant))
            audio_female_formant = audio_female_formant[:min_len]
            # Save the result
            sf.write(output_path, audio_female_formant, sample_rate)
            return output_path
        except Exception as e:
            print(f"Error during conversion: {e}")
            return None


# Initialize voice converter
voice_converter = VoiceConverter()
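
# The converter can also be used directly, without the UI, e.g.
# (hypothetical file names):
#   voice_converter.convert_to_female("input_male.wav", "output_female.wav")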


# Create Gradio interface
def convert_voice(audio_file):
    """Handle a single conversion request from the Gradio interface."""
    # Build the output path next to the uploaded file
    input_filename = os.path.basename(audio_file)
    output_filename = f"female_{input_filename}"
    output_path = os.path.join(os.path.dirname(audio_file), output_filename)
    # Perform voice conversion
    result = voice_converter.convert_to_female(audio_file, output_path)
    if result:
        return result
    return None


# Define the Gradio interface
demo = gr.Interface(
    fn=convert_voice,
    inputs=gr.Audio(type="filepath", label="Upload Male Voice Audio"),
    outputs=gr.Audio(label="Converted Female Voice"),
    title="Voice Gender Conversion (Male to Female)",
    description="Upload an audio file with a male voice to convert it to a female voice using AI.",
    examples=[["sample1.wav"], ["sample2.wav"]],
    theme=gr.themes.Soft()
)

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)
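
# To try it out (assuming this script is saved as app.py):
#   python app.py
# share=True asks Gradio to create a temporary public link in addition to
# the local server.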