# Gradio Space: simple experimental audio style transfer (voice changer) demo.
import io
import os

import gradio as gr
import librosa
import librosa.display
import numpy as np
import soundfile as sf
from pydub import AudioSegment
# Normalize arbitrary uploads (mp3/ogg/flac/...) to WAV via pydub.
def convert_to_wav(audio_file_path):
    """Decode *audio_file_path* with pydub and export it as ``<path>.wav``.

    Returns the path of the newly written WAV file. Raises ``gr.Error``
    (surfaced in the Gradio UI) if the file cannot be decoded or exported.
    """
    try:
        segment = AudioSegment.from_file(audio_file_path)
        wav_path = f"{audio_file_path}.wav"
        segment.export(wav_path, format="wav")
    except Exception as e:
        raise gr.Error(f"Error converting audio to WAV: {e}")
    return wav_path
def _estimate_f0(y, sr, label):
    """Return the pyin F0 track (Hz, NaN where unvoiced) for signal *y*.

    Tries a musical C2-C7 range first; on failure retries with a wider,
    fixed 60-500 Hz range, logging which signal (*label*) needed the fallback.
    """
    try:
        f0, _, _ = librosa.pyin(
            y,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=sr,
            frame_length=2048,
        )
    except Exception as e:
        print(f"Pyin failed for {label}, trying different params or fallback: {e}")
        f0, _, _ = librosa.pyin(y, fmin=60, fmax=500, sr=sr, frame_length=2048)
    return f0


# Main voice changer function (simplified)
def voice_changer(source_audio_path, target_audio_path):
    """Pitch- and tempo-adjust the target audio toward the source audio.

    This is a simplified melody/rhythm transfer, not timbre transfer: it
    shifts the target by the ratio of mean F0s and time-stretches it to
    roughly the source's duration.

    Args:
        source_audio_path: path of the reference audio (any format pydub reads).
        target_audio_path: path of the audio to be transformed.

    Returns:
        Path of the written WAV file ("output_voice_changed.wav").

    Raises:
        gr.Error: on missing inputs, conversion failure, or processing failure.
    """
    if source_audio_path is None or target_audio_path is None:
        raise gr.Error("Please upload both source and target audio files.")
    # Ensure audio files are in WAV format
    source_wav_path = convert_to_wav(source_audio_path)
    target_wav_path = convert_to_wav(target_audio_path)
    try:
        # Load both files at their native sample rates.
        y_source, sr_source = librosa.load(source_wav_path, sr=None)
        y_target, sr_target = librosa.load(target_wav_path, sr=None)
        if y_source.size == 0 or y_target.size == 0:
            raise gr.Error("One of the uploaded files contains no audio samples.")
        # Resample target audio to source sample rate if different
        if sr_source != sr_target:
            y_target = librosa.resample(y_target, orig_sr=sr_target, target_sr=sr_source)
            print(f"Resampled target audio from {sr_target} to {sr_source} Hz.")
        # --- Simplified Voice Transfer Logic (Melody/Rhythm Transfer) ---
        # Fix: the old code pre-initialized f0 via librosa.display.cqt_frequencies,
        # which does not exist (cqt_frequencies lives in librosa, not
        # librosa.display) and raised AttributeError on every call. pyin is the
        # only estimator actually needed. Note the target is analyzed at
        # sr_source: after the resample above, that IS its sample rate (the old
        # code wrongly passed sr_target here).
        f0_source = _estimate_f0(y_source, sr_source, "source")
        f0_target = _estimate_f0(y_target, sr_source, "target")
        # Mean F0 over voiced frames only (pyin marks unvoiced frames NaN).
        # Guard against signals with no detectable pitch: np.mean of an empty
        # slice would yield NaN and a RuntimeWarning.
        voiced_source = f0_source[np.isfinite(f0_source) & (f0_source > 0)]
        voiced_target = f0_target[np.isfinite(f0_target) & (f0_target > 0)]
        if voiced_source.size > 0 and voiced_target.size > 0:
            pitch_shift_factor = float(np.mean(voiced_source) / np.mean(voiced_target))
        else:
            pitch_shift_factor = 1.0  # No pitch shift if no valid pitch detected
        # pitch_shift works in semitones; convert the frequency ratio.
        # This phase-vocoder approach (not PSOLA) can introduce artifacts.
        n_steps = 12 * np.log2(pitch_shift_factor) if pitch_shift_factor > 0 else 0
        # Crude tempo match. time_stretch's `rate` SHORTENS audio when > 1
        # (output length ~= len(y) / rate), so matching the source duration
        # needs rate = len(target) / len(source) — the old code used the
        # reciprocal and stretched in the wrong direction.
        duration_ratio = len(y_target) / len(y_source)
        y_target_adjusted_tempo = librosa.effects.time_stretch(y_target, rate=duration_ratio)
        # Apply pitch shift to the tempo-adjusted target audio.
        y_output = librosa.effects.pitch_shift(y_target_adjusted_tempo, sr=sr_source, n_steps=n_steps)
        # Normalize the output audio to prevent clipping.
        y_output = librosa.util.normalize(y_output)
        output_file_path = "output_voice_changed.wav"
        sf.write(output_file_path, y_output, sr_source)
        return output_file_path
    except gr.Error:
        # User-facing errors raised above pass through unwrapped.
        raise
    except Exception as e:
        raise gr.Error(f"An error occurred during voice processing: {e}")
    finally:
        # Clean up temporary WAV files
        for tmp_path in (source_wav_path, target_wav_path):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
# Gradio Interface: two uploads in, one transformed audio out.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Simple Audio Style Transfer (Voice Changer - Experimental)
        Upload two audio files. The goal is to make the "Target Audio" mimic the pitch/melody of the "Source Audio".
        **Note:** This is a very basic implementation and **not a full voice cloning/timbre transfer**.
        It performs a simplified pitch and tempo adjustment based on the source's characteristics.
        Expect artifacts and limited "voice changing" effect. For true voice cloning, more advanced models are needed.
        """
    )
    # Side-by-side upload widgets for the reference and the audio to change.
    with gr.Row():
        src_upload = gr.Audio(sources=["upload"], type="filepath", label="Source Audio (Reference Voice/Style)")
        tgt_upload = gr.Audio(sources=["upload"], type="filepath", label="Target Audio (Voice to be Changed)")
    result_audio = gr.Audio(label="Transformed Audio")
    transform_btn = gr.Button("Transform Voice")
    # Wire the button to the processing function.
    transform_btn.click(fn=voice_changer, inputs=[src_upload, tgt_upload], outputs=result_audio)
if __name__ == "__main__":
    # Fix: `import soundfile as sf` used to live here, inside the guard. When
    # this module is imported instead of run directly (how Spaces launch apps),
    # that import never executed and `sf.write` in voice_changer raised
    # NameError. soundfile is now imported at the top of the file.
    demo.launch()