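"""Male-to-female voice conversion demo (Gradio app).

Applies a pitch shift plus a crude resample-based formant shift to make
a male voice sound more female. A wav2vec2 model is loaded for feature
extraction, though the current conversion pipeline is purely DSP-based.
"""
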
import os
import torch
import gradio as gr
import librosa
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class VoiceConverter:
    def __init__(self):
        # Load wav2vec2 model for audio feature extraction
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)

        # Parameters for voice conversion
        self.female_pitch_shift = 2.0  # Pitch shift in semitones (librosa n_steps)
        self.female_formant_shift = 1.2  # Formant scaling factor (>1 raises formants)
        
    def extract_features(self, audio, sample_rate):
        """Extract audio features using wav2vec2.

        Not used by the signal-processing pipeline below; kept for
        experimenting with feature-based conversion approaches.
        """
        # Resample if needed (wav2vec2-base-960h expects 16 kHz audio)
        if sample_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000

        # Run the model; hidden states must be requested explicitly,
        # otherwise outputs.hidden_states is None
        inputs = self.processor(audio, sampling_rate=sample_rate, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)

        # Last hidden layer as a (num_frames, hidden_dim) array
        hidden_states = outputs.hidden_states[-1].squeeze().cpu().numpy()
        return hidden_states
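
    # Illustrative usage (hypothetical names, not called by the app):
    #
    #   audio, sr = librosa.load("clip.wav", sr=None)
    #   feats = VoiceConverter().extract_features(audio, sr)
    #   print(feats.shape)  # (num_frames, 768) for wav2vec2-base
    #
    # Each row is one ~20 ms frame embedding that could drive a learned
    # conversion model in place of the DSP pipeline below.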
    
    def convert_to_female(self, audio_path, output_path):
        """Convert voice from male to female"""
        try:
            # Load audio file (librosa returns mono float32 by default)
            audio, sample_rate = librosa.load(audio_path, sr=None)

            # Convert to mono if a multi-channel array slipped through
            if audio.ndim > 1:
                audio = librosa.to_mono(audio)

            # Shift pitch up for a female voice (n_steps is in semitones)
            audio_female = librosa.effects.pitch_shift(
                audio,
                sr=sample_rate,
                n_steps=self.female_pitch_shift
            )
            
            # Apply a simple resample-based formant shift: first lengthen
            # the signal, then resample while relabeling the sample rate so
            # that, played back at the original rate, all frequencies
            # (formants included) scale up by the shift factor while the
            # duration stays roughly unchanged. Note this crude method
            # shifts pitch as well, compounding the shift above.
            y_stretched = librosa.effects.time_stretch(
                audio_female, rate=1.0 / self.female_formant_shift
            )
            audio_female_formant = librosa.resample(
                y_stretched,
                orig_sr=int(sample_rate * self.female_formant_shift),
                target_sr=sample_rate
            )
            
            # Match the length with the original
            min_len = min(len(audio), len(audio_female_formant))
            audio_female_formant = audio_female_formant[:min_len]
            
            # Save the result
            sf.write(output_path, audio_female_formant, sample_rate)
            return output_path
            
        except Exception as e:
            print(f"Error during conversion: {e}")
            return None

# Initialize voice converter
voice_converter = VoiceConverter()
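
# Illustrative direct usage (hypothetical file names, not part of the app):
#
#   result = voice_converter.convert_to_female("input_male.wav", "output_female.wav")
#   if result:
#       print(f"Saved converted audio to {result}")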

# Create Gradio interface
def convert_voice(audio_file):
    """Function to handle the Gradio interface"""
    if audio_file is None:
        return None

    # Build an output path next to the uploaded file
    input_filename = os.path.basename(audio_file)
    output_filename = f"female_{input_filename}"
    output_path = os.path.join(os.path.dirname(audio_file), output_filename)

    # Perform voice conversion; returns None on failure
    return voice_converter.convert_to_female(audio_file, output_path)

# Define the Gradio interface
demo = gr.Interface(
    fn=convert_voice,
    inputs=gr.Audio(type="filepath", label="Upload Male Voice Audio"),
    outputs=gr.Audio(label="Converted Female Voice"),
    title="Voice Gender Conversion (Male to Female)",
    description="Upload an audio file with a male voice to convert it to a female voice using AI.",
    # Example clips must exist alongside this script; remove if absent
    examples=[["sample1.wav"], ["sample2.wav"]],
    theme=gr.themes.Soft()
)

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)  # share=True creates a temporary public link