File size: 6,199 Bytes
1b2f7fd
3b0f5c1
 
 
 
 
ec585dc
3b0f5c1
 
 
89d2d90
3b0f5c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89d2d90
3b0f5c1
 
 
 
 
89d2d90
3b0f5c1
89d2d90
3b0f5c1
 
 
 
 
 
 
 
 
 
89d2d90
3b0f5c1
 
 
 
 
 
 
 
 
 
89d2d90
 
ec585dc
89d2d90
3b0f5c1
89d2d90
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import io
import os

import gradio as gr
import librosa
import librosa.display
import numpy as np
import soundfile as sf
from pydub import AudioSegment

# Function to convert any audio to WAV using pydub
def convert_to_wav(audio_file_path):
    """Transcode an arbitrary audio file to WAV and return the new file's path.

    The WAV copy is written next to the input as ``<input>.wav``. Any decoding
    or export failure is surfaced to the UI as a ``gr.Error``.
    """
    try:
        wav_file_path = f"{audio_file_path}.wav"
        AudioSegment.from_file(audio_file_path).export(wav_file_path, format="wav")
        return wav_file_path
    except Exception as exc:
        raise gr.Error(f"Error converting audio to WAV: {exc}")

# Helper: estimate per-frame fundamental frequency (F0) with pyin.
def _estimate_f0(y, sr, label):
    """Return pyin F0 track for *y* at sample rate *sr*; NaN marks unvoiced frames.

    Tries a musical C2–C7 range first, then falls back to a wider 60–500 Hz
    range if pyin raises (mirrors the original fallback behavior).
    """
    try:
        f0, _, _ = librosa.pyin(
            y,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=sr,
            frame_length=2048,
        )
    except Exception as e:
        print(f"Pyin failed for {label}, trying different params or fallback: {e}")
        f0, _, _ = librosa.pyin(y, fmin=60, fmax=500, sr=sr, frame_length=2048)
    return f0


# Main voice changer function (simplified)
def voice_changer(source_audio_path, target_audio_path):
    """Shift the target audio's pitch and duration toward the source's.

    This is a simplified melody/rhythm transfer, not timbre transfer: it
    matches the target's mean F0 and overall duration to the source via a
    phase-vocoder pitch shift and time stretch.

    Args:
        source_audio_path: path to the reference audio (style/pitch donor).
        target_audio_path: path to the audio to be transformed.

    Returns:
        Path to the written WAV file with the transformed audio.

    Raises:
        gr.Error: on missing inputs, empty audio, or any processing failure.
    """
    if source_audio_path is None or target_audio_path is None:
        raise gr.Error("Please upload both source and target audio files.")

    # Ensure audio files are in WAV format
    source_wav_path = convert_to_wav(source_audio_path)
    target_wav_path = convert_to_wav(target_audio_path)

    try:
        # Load audio files at their native sample rates.
        y_source, sr_source = librosa.load(source_wav_path, sr=None)
        y_target, sr_target = librosa.load(target_wav_path, sr=None)

        # Guard against empty decodes (would otherwise divide by zero below).
        if len(y_source) == 0 or len(y_target) == 0:
            raise gr.Error("One of the uploaded audio files contains no samples.")

        # Resample target audio to source sample rate if different.
        if sr_source != sr_target:
            y_target = librosa.resample(y_target, orig_sr=sr_target, target_sr=sr_source)
            print(f"Resampled target audio from {sr_target} to {sr_source} Hz.")

        # --- Simplified Voice Transfer Logic (Melody/Rhythm Transfer) ---
        # 1. Pitch estimation. Note: y_target is at sr_source after resampling,
        # so both tracks are analyzed at sr_source (the original passed
        # sr_target here, giving wrong estimates whenever the rates differed).
        f0_source = _estimate_f0(y_source, sr_source, "source")
        f0_target = _estimate_f0(y_target, sr_source, "target")

        # 2. Mean F0 over voiced frames only (pyin marks unvoiced as NaN).
        # Guard the empty-voiced case so np.mean never sees an empty slice.
        voiced_source = f0_source[~np.isnan(f0_source)]
        voiced_target = f0_target[~np.isnan(f0_target)]
        mean_f0_source = float(voiced_source.mean()) if voiced_source.size else 0.0
        mean_f0_target = float(voiced_target.mean()) if voiced_target.size else 0.0

        # Simple global pitch ratio; no per-frame alignment.
        if mean_f0_source > 0 and mean_f0_target > 0:
            pitch_shift_factor = mean_f0_source / mean_f0_target
        else:
            pitch_shift_factor = 1.0  # no shift if either track had no voiced pitch

        # Convert the ratio to semitones for librosa.effects.pitch_shift.
        n_steps = 12 * np.log2(pitch_shift_factor)

        # 3. Match durations. time_stretch output length ≈ len(input) / rate,
        # so to make the target as long as the source the rate must be
        # len(target) / len(source) (the original used the reciprocal, which
        # moved the duration *away* from the source's).
        stretch_rate = len(y_target) / len(y_source)
        y_target_adjusted_tempo = librosa.effects.time_stretch(y_target, rate=stretch_rate)

        # 4. Apply the phase-vocoder pitch shift (not PSOLA; expect artifacts).
        y_output = librosa.effects.pitch_shift(
            y_target_adjusted_tempo, sr=sr_source, n_steps=n_steps
        )

        # Normalize the output audio to prevent clipping.
        y_output = librosa.util.normalize(y_output)

        output_file_path = "output_voice_changed.wav"
        sf.write(output_file_path, y_output, sr_source)

        return output_file_path

    except gr.Error:
        # Already user-facing; don't re-wrap (the original wrapped its own
        # gr.Error raises in a second layer of "An error occurred...").
        raise
    except Exception as e:
        raise gr.Error(f"An error occurred during voice processing: {e}")
    finally:
        # Clean up temporary WAV files regardless of success.
        for tmp_path in (source_wav_path, target_wav_path):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

# Gradio Interface
# Top-level UI wiring: builds the Blocks layout and binds the button click
# to voice_changer. `demo` is launched from the __main__ guard below.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Simple Audio Style Transfer (Voice Changer - Experimental)
        Upload two audio files. The goal is to make the "Target Audio" mimic the pitch/melody of the "Source Audio".
        **Note:** This is a very basic implementation and **not a full voice cloning/timbre transfer**.
        It performs a simplified pitch and tempo adjustment based on the source's characteristics.
        Expect artifacts and limited "voice changing" effect. For true voice cloning, more advanced models are needed.
        """
    )

    # Both inputs are file paths (type="filepath") so voice_changer can hand
    # them straight to pydub/librosa; upload-only to avoid mic permissions.
    with gr.Row():
        source_audio_input = gr.Audio(type="filepath", label="Source Audio (Reference Voice/Style)", sources=["upload"])
        target_audio_input = gr.Audio(type="filepath", label="Target Audio (Voice to be Changed)", sources=["upload"])

    # voice_changer returns a WAV file path, which gr.Audio renders as playable audio.
    output_audio = gr.Audio(label="Transformed Audio")

    voice_changer_button = gr.Button("Transform Voice")

    voice_changer_button.click(
        fn=voice_changer,
        inputs=[source_audio_input, target_audio_input],
        outputs=output_audio
    )

if __name__ == "__main__":
    # NOTE: soundfile (sf) is imported at the top of the file, not here.
    # The original imported it only inside this guard, so voice_changer
    # raised NameError on sf.write whenever the module was imported
    # (e.g. by a WSGI/ASGI host) instead of run as a script.
    demo.launch()