Update app.py
app.py CHANGED
@@ -1,109 +1,136 @@
import gradio as gr
-import torchaudio
-from speechbrain.pretrained import EncoderClassifier, HIFIGAN
-import torch
-import tempfile
import os

-classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
-hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech")
-
-def extract_speaker_embedding(audio_file):
-    """Extract speaker embedding from audio file"""
-    signal, fs = torchaudio.load(audio_file)
-
-    # Resample if needed
-    if fs != 16000:
-        resampler = torchaudio.transforms.Resample(fs, 16000)
-        signal = resampler(signal)
-        fs = 16000
-
-    # Handle stereo audio
-    if signal.shape[0] > 1:
-        signal = torch.mean(signal, dim=0, keepdim=True)
-
-    embeddings = classifier.encode_batch(signal)
-    return embeddings.squeeze(0)
-
-def voice_conversion(source_audio, target_audio):
-    """Convert source voice to sound like target voice"""
-    # Create temp files
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as source_tmp, \
-         tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as target_tmp:
-
-        source_path = source_tmp.name
-        target_path = target_tmp.name
-
-        # Save uploaded files
-        source_audio.save(source_path)
-        target_audio.save(target_path)
-
        try:
-
        finally:
-            # Clean up
-            os.remove(source_path)
-            os.remove(target_path)

-# Gradio
with gr.Blocks() as demo:
-    gr.Markdown(
-
    with gr.Row():
-
-        inputs=[source_audio, target_audio],
        outputs=output_audio
    )
-
-    gr.Examples(
-        examples=[
-            [os.path.join(os.path.dirname(__file__), "examples/source1.wav"),
-             os.path.join(os.path.dirname(__file__), "examples/target1.wav")],
-            [os.path.join(os.path.dirname(__file__), "examples/source2.wav"),
-             os.path.join(os.path.dirname(__file__), "examples/target2.wav")]
-        ],
-        inputs=[source_audio, target_audio],
-        outputs=output_audio,
-        fn=voice_conversion,
-        cache_examples=True
-    )

if __name__ == "__main__":
    demo.launch()

import gradio as gr
+import librosa
+import numpy as np
+import soundfile as sf  # used by sf.write below; previously imported at the bottom of the file
+from pydub import AudioSegment
import os
+
+# Function to convert any audio to WAV using pydub
+def convert_to_wav(audio_file_path):
    try:
+        audio = AudioSegment.from_file(audio_file_path)
+        wav_file_path = audio_file_path + ".wav"
+        audio.export(wav_file_path, format="wav")
+        return wav_file_path
+    except Exception as e:
+        raise gr.Error(f"Error converting audio to WAV: {e}")
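+
+# Note: AudioSegment.from_file() hands decoding to ffmpeg, so non-WAV uploads
+# (mp3, m4a, ogg, ...) need ffmpeg available on the host.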
+
+# Main voice changer function (simplified)
+def voice_changer(source_audio_path, target_audio_path):
+    if source_audio_path is None or target_audio_path is None:
+        raise gr.Error("Please upload both source and target audio files.")
+
+    # Ensure audio files are in WAV format
+    source_wav_path = convert_to_wav(source_audio_path)
+    target_wav_path = convert_to_wav(target_audio_path)
+
+    try:
+        # Load audio files (sr=None keeps each file's native sample rate)
+        y_source, sr_source = librosa.load(source_wav_path, sr=None)
+        y_target, sr_target = librosa.load(target_wav_path, sr=None)
+
+        # Resample target audio to source sample rate if different
+        if sr_source != sr_target:
+            y_target = librosa.resample(y_target, orig_sr=sr_target, target_sr=sr_source)
+            print(f"Resampled target audio from {sr_target} to {sr_source} Hz.")
+            sr_target = sr_source  # keep the rate variable consistent for the pyin call below
+
+        # --- Simplified Voice Transfer Logic (Melody/Rhythm Transfer) ---
+        # This is a very basic approach, not a full timbre transfer:
+        # it aligns the dominant pitch of the target with the source.
+
+        # 1. Estimate F0 for the source; pyin returns (f0, voiced_flag, voiced_probs)
+        try:
+            f0_source, _, _ = librosa.pyin(y_source, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr_source, frame_length=2048)
+        except Exception as e:
+            print(f"Pyin failed for source, retrying with a more robust range: {e}")
+            f0_source, _, _ = librosa.pyin(y_source, fmin=60, fmax=500, sr=sr_source, frame_length=2048)
+
+        # 2. Estimate F0 for the target
+        try:
+            f0_target, _, _ = librosa.pyin(y_target, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr_target, frame_length=2048)
+        except Exception as e:
+            print(f"Pyin failed for target, retrying with a more robust range: {e}")
+            f0_target, _, _ = librosa.pyin(y_target, fmin=60, fmax=500, sr=sr_target, frame_length=2048)
+
+        # Replace NaNs from pyin (unvoiced segments) with zeros
+        f0_source_interpolated = np.nan_to_num(f0_source, nan=0.0)
+        f0_target_interpolated = np.nan_to_num(f0_target, nan=0.0)
+
+        # Calculate a simple pitch shift ratio based on mean voiced F0.
+        # This is very simplistic and doesn't account for variation over time;
+        # a more advanced approach would involve temporal alignment and mapping.
+        mean_f0_source = np.mean(f0_source_interpolated[f0_source_interpolated > 0])
+        mean_f0_target = np.mean(f0_target_interpolated[f0_target_interpolated > 0])
+
+        if mean_f0_target > 0 and mean_f0_source > 0:
+            pitch_shift_factor = mean_f0_source / mean_f0_target
+        else:
+            pitch_shift_factor = 1.0  # No pitch shift if no valid pitch detected
+
+        # Convert the frequency ratio to semitones for librosa.effects.pitch_shift,
+        # which is phase-vocoder based (not PSOLA) and can introduce artifacts.
+        n_steps = 12 * np.log2(pitch_shift_factor) if pitch_shift_factor > 0 else 0
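+        # e.g. mean F0s of 220 Hz (source) and 110 Hz (target) give a factor of 2.0,
+        # so n_steps = 12 * log2(2.0) = +12 semitones (shift the target up one octave).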
+
+        # Adjust the duration of the target audio to roughly match the source.
+        # time_stretch output length is input length / rate, so matching the
+        # source duration needs rate = len(y_target) / len(y_source).
+        duration_ratio = len(y_target) / len(y_source)
+        y_target_adjusted_tempo = librosa.effects.time_stretch(y_target, rate=duration_ratio)
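+        # e.g. a 4 s target and a 2 s source give rate = 2.0, and the stretched
+        # target comes out at 4 s / 2.0 = 2 s, matching the source.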
+
+        # Apply the pitch shift to the tempo-adjusted target audio
+        y_output = librosa.effects.pitch_shift(y_target_adjusted_tempo, sr=sr_source, n_steps=n_steps)
+
+        # Normalize the output audio to prevent clipping
+        y_output = librosa.util.normalize(y_output)
+
+        # Write the result to a WAV file and return its path for Gradio
+        output_file_path = "output_voice_changed.wav"
+        sf.write(output_file_path, y_output, sr_source)
+
+        return output_file_path
+
+    except Exception as e:
+        raise gr.Error(f"An error occurred during voice processing: {e}")
    finally:
+        # Clean up temporary WAV files
+        if os.path.exists(source_wav_path):
+            os.remove(source_wav_path)
+        if os.path.exists(target_wav_path):
+            os.remove(target_wav_path)

+# Gradio Interface
with gr.Blocks() as demo:
+    gr.Markdown(
+        """
+        # Simple Audio Style Transfer (Voice Changer - Experimental)
+        Upload two audio files. The goal is to make the "Target Audio" mimic the pitch/melody of the "Source Audio".
+        **Note:** This is a very basic implementation and **not a full voice cloning/timbre transfer**.
+        It performs a simplified pitch and tempo adjustment based on the source's characteristics.
+        Expect artifacts and a limited "voice changing" effect. For true voice cloning, more advanced models are needed.
+        """
+    )
+
    with gr.Row():
+        source_audio_input = gr.Audio(type="filepath", label="Source Audio (Reference Voice/Style)", sources=["upload"])
+        target_audio_input = gr.Audio(type="filepath", label="Target Audio (Voice to be Changed)", sources=["upload"])
+
+    output_audio = gr.Audio(label="Transformed Audio")
+
+    voice_changer_button = gr.Button("Transform Voice")
+
+    voice_changer_button.click(
+        fn=voice_changer,
+        inputs=[source_audio_input, target_audio_input],
        outputs=output_audio
    )

if __name__ == "__main__":
    demo.launch()
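For intuition on the pitch-matching math in the new voice_changer: below is a minimal sanity check, assuming only librosa and numpy. The 220 Hz / 110 Hz test tones and the two-second length are arbitrary illustration values, not part of the Space.

import numpy as np
import librosa

sr = 22050
t = np.arange(int(sr * 2.0)) / sr
y_source = 0.5 * np.sin(2 * np.pi * 220.0 * t)  # stand-in "source" at 220 Hz
y_target = 0.5 * np.sin(2 * np.pi * 110.0 * t)  # stand-in "target" at 110 Hz

# Same estimator the app uses; pyin marks unvoiced frames as NaN
f0_s, _, _ = librosa.pyin(y_source, fmin=60, fmax=500, sr=sr)
f0_t, _, _ = librosa.pyin(y_target, fmin=60, fmax=500, sr=sr)

mean_s = np.nanmean(f0_s)  # ~220 Hz
mean_t = np.nanmean(f0_t)  # ~110 Hz
n_steps = 12 * np.log2(mean_s / mean_t)
print(f"{mean_s:.1f} Hz vs {mean_t:.1f} Hz -> {n_steps:+.2f} semitones")  # ~ +12.00

# Shifting the target by n_steps should bring its pitch up to the source's
y_shifted = librosa.effects.pitch_shift(y_target, sr=sr, n_steps=n_steps)
f0_check, _, _ = librosa.pyin(y_shifted, fmin=60, fmax=500, sr=sr)
print(f"shifted target mean F0: {np.nanmean(f0_check):.1f} Hz")  # ~220 Hz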