import gradio as gr
import librosa
import numpy as np
from pydub import AudioSegment
import os
import soundfile as sf  # Used to write the output WAV file
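
# Assumed dependencies (not pinned anywhere in this file):
#   pip install gradio librosa numpy soundfile pydub
# pydub additionally needs an ffmpeg binary on the PATH to decode non-WAV
# uploads such as MP3 or M4A.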

# Function to convert any audio to WAV using pydub
def convert_to_wav(audio_file_path):
    try:
        audio = AudioSegment.from_file(audio_file_path)
        # Create a temporary file path for WAV
        wav_file_path = audio_file_path + ".wav"
        audio.export(wav_file_path, format="wav")
        return wav_file_path
    except Exception as e:
        raise gr.Error(f"Error converting audio to WAV: {e}")

# Main voice changer function (simplified)
def voice_changer(source_audio_path, target_audio_path):
    if source_audio_path is None or target_audio_path is None:
        raise gr.Error("Please upload both source and target audio files.")

    # Ensure audio files are in WAV format
    source_wav_path = None
    target_wav_path = None

    try:
        source_wav_path = convert_to_wav(source_audio_path)
        target_wav_path = convert_to_wav(target_audio_path)

        # Load audio files
        y_source, sr_source = librosa.load(source_wav_path, sr=None)
        y_target, sr_target = librosa.load(target_wav_path, sr=None)

        # Resample target audio to source sample rate if different
        if sr_source != sr_target:
            y_target = librosa.resample(y_target, orig_sr=sr_target, target_sr=sr_source)
            print(f"Resampled target audio from {sr_target} to {sr_source} Hz.")
            sr_target = sr_source  # y_target is now at the source rate

        # --- Simplified Voice Transfer Logic (Melody/Rhythm Transfer) ---
        # This is a very basic approach and not a full timbre transfer.
        # It tries to align the dominant pitch of the target with the source.

        # 1. Estimate F0 for the source
        # librosa.pyin returns (f0, voiced_flag, voiced_probabilities)
        try:
            f0_source, voiced_flag_source, voiced_probs_source = librosa.pyin(
                y_source, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr_source
            )
        except Exception as e:
            print(f"pyin failed for source with the general range, retrying with a speech-oriented range: {e}")
            f0_source, voiced_flag_source, voiced_probs_source = librosa.pyin(
                y_source, fmin=60, fmax=500, sr=sr_source  # Narrower range typical of speech
            )

        # 2. Estimate F0 for the target
        try:
            f0_target, voiced_flag_target, voiced_probs_target = librosa.pyin(
                y_target, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr_target
            )
        except Exception as e:
            print(f"pyin failed for target with the general range, retrying with a speech-oriented range: {e}")
            f0_target, voiced_flag_target, voiced_probs_target = librosa.pyin(
                y_target, fmin=60, fmax=500, sr=sr_target  # Narrower range typical of speech
            )

        # pyin marks unvoiced frames as NaN; drop those frames so they don't
        # skew the mean F0 (keep only voiced frames rather than zero-filling).
        f0_source_valid = f0_source[~np.isnan(f0_source)]
        f0_target_valid = f0_target[~np.isnan(f0_target)]

        # Calculate a simple pitch shift ratio based on mean F0
        # This is very simplistic and doesn't account for variations over time.
        # A more advanced approach would involve temporal alignment and mapping.
        mean_f0_source = np.mean(f0_source_valid) if len(f0_source_valid) > 0 else 0
        mean_f0_target = np.mean(f0_target_valid) if len(f0_target_valid) > 0 else 0

        if mean_f0_target > 0.1 and mean_f0_source > 0.1: # Check for very small positive values
            pitch_shift_factor = mean_f0_source / mean_f0_target
        else:
            pitch_shift_factor = 1.0 # No pitch shift if no valid pitch detected or both are silent
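        # Worked example: mean source F0 = 220 Hz and mean target F0 = 110 Hz
        # give pitch_shift_factor = 2.0, i.e. the target should move up an octave.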

        # Apply a pitch shift to the target audio
        # Using librosa.effects.pitch_shift, which is based on a phase vocoder.
        # This is not PSOLA and can introduce artifacts.
        # The `n_steps` argument is in semitones.
        # log2(pitch_shift_factor) * 12 gives us semitones
        n_steps = 12 * np.log2(pitch_shift_factor) if pitch_shift_factor > 0 else 0
        print(f"Calculated pitch shift: {n_steps:.2f} semitones.")


        # Adjust the duration of the target audio to roughly match the source.
        # This is crude uniform time stretching; librosa.get_duration reports
        # each clip's length in seconds.
        duration_source = librosa.get_duration(y=y_source, sr=sr_source)
        duration_target = librosa.get_duration(y=y_target, sr=sr_target)

        # Avoid division by zero
        if duration_target > 0:
            duration_ratio = duration_source / duration_target
        else:
            duration_ratio = 1.0 # No time change if target has no duration

        print(f"Duration Source: {duration_source:.2f}s, Target: {duration_target:.2f}s, Ratio: {duration_ratio:.2f}")

        if abs(duration_ratio - 1.0) > 1e-3:
            # librosa.effects.time_stretch speeds the signal up when rate > 1
            # (output duration = input duration / rate), so matching the source
            # duration requires stretching by the inverse of duration_ratio.
            y_target_adjusted_tempo = librosa.effects.time_stretch(y_target, rate=1.0 / duration_ratio)
        else:
            y_target_adjusted_tempo = y_target  # Durations already match closely

        # Apply pitch shift to the tempo-adjusted target audio
        y_output = librosa.effects.pitch_shift(y_target_adjusted_tempo, sr=sr_source, n_steps=n_steps)

        # Normalize the output audio to prevent clipping
        y_output = librosa.util.normalize(y_output)

        # Save the output as a WAV next to the script; Gradio serves this path
        # back to the browser (the file is overwritten on each run).
        output_file_path = "output_voice_changed.wav"
        sf.write(output_file_path, y_output, sr_source)

        return output_file_path

    except Exception as e:
        raise gr.Error(f"An error occurred during voice processing: {e}")
    finally:
        # Clean up temporary WAV files irrespective of success/failure
        if source_wav_path and os.path.exists(source_wav_path):
            os.remove(source_wav_path)
        if target_wav_path and os.path.exists(target_wav_path):
            os.remove(target_wav_path)
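
# The mean-F0 mapping above collapses all pitch movement into one number. A
# minimal sketch of the "temporal alignment and mapping" idea mentioned there
# (a hypothetical helper, not wired into voice_changer): resample both F0
# contours onto a common time grid and take the median frame-wise ratio,
# which is more robust to outliers and octave errors than a ratio of means.
def framewise_pitch_ratio(f0_source, f0_target):
    valid_src = f0_source[~np.isnan(f0_source)]
    valid_tgt = f0_target[~np.isnan(f0_target)]
    if len(valid_src) == 0 or len(valid_tgt) == 0:
        return 1.0  # Nothing voiced to compare; leave the pitch untouched
    # Linearly interpolate both contours onto a common grid so frames can be
    # compared pairwise even when the clips have different lengths.
    n = max(len(valid_src), len(valid_tgt))
    grid = np.linspace(0.0, 1.0, n)
    src = np.interp(grid, np.linspace(0.0, 1.0, len(valid_src)), valid_src)
    tgt = np.interp(grid, np.linspace(0.0, 1.0, len(valid_tgt)), valid_tgt)
    return float(np.median(src / tgt))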

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Simple Audio Style Transfer (Voice Changer - Experimental)
        Upload two audio files. The goal is to make the "Target Audio" mimic the pitch and melody of the "Source Audio".
        **Note:** This is a very basic implementation and **not full voice cloning or timbre transfer**.
        It applies a simplified pitch and tempo adjustment derived from the source's characteristics.
        Expect artifacts and a limited voice-changing effect; true voice cloning requires far more advanced models.
        """
    )

    with gr.Row():
        source_audio_input = gr.Audio(type="filepath", label="Source Audio (Reference Voice/Style)", sources=["upload"])
        target_audio_input = gr.Audio(type="filepath", label="Target Audio (Voice to be Changed)", sources=["upload"])

    output_audio = gr.Audio(label="Transformed Audio")

    voice_changer_button = gr.Button("Transform Voice")

    voice_changer_button.click(
        fn=voice_changer,
        inputs=[source_audio_input, target_audio_input],
        outputs=output_audio
    )

if __name__ == "__main__":
    demo.launch()
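
# Launch locally with `python app.py`; by default Gradio serves the UI at
# http://127.0.0.1:7860.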