# NOTE(review): removed non-Python page-scrape residue that preceded this file
# (Hugging Face Spaces header, file size, commit hashes, and a line-number
# gutter) — it was not source code and would be a syntax error.
import gradio as gr
import librosa
import numpy as np
from pydub import AudioSegment
import io
import os
import soundfile as sf # Required for sf.write
# Function to convert any audio to WAV using pydub
def convert_to_wav(audio_file_path):
    """Convert the audio file at *audio_file_path* to WAV via pydub.

    The WAV copy is written alongside the original as ``<path>.wav`` and its
    path is returned.  Raises ``gr.Error`` if pydub cannot decode or export
    the file (e.g. unsupported format or missing ffmpeg backend).
    """
    try:
        segment = AudioSegment.from_file(audio_file_path)
        wav_path = audio_file_path + ".wav"
        segment.export(wav_path, format="wav")
    except Exception as e:
        raise gr.Error(f"Error converting audio to WAV: {e}")
    return wav_path
# Main voice changer function (simplified)
def _estimate_f0(y, sr):
    """Estimate the fundamental-frequency (F0) track of *y* with librosa.pyin.

    Tries a wide musical range (C2-C7) first; if pyin raises, falls back to a
    narrower 60-500 Hz range that is more robust for typical speech.

    Returns the f0 array only (NaN in unvoiced frames); the voicing flag and
    probability outputs of pyin are not needed by the caller.
    """
    try:
        f0, _voiced_flag, _voiced_probs = librosa.pyin(
            y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr
        )
    except Exception as e:
        print(f"Pyin failed with general range, trying broader range: {e}")
        f0, _voiced_flag, _voiced_probs = librosa.pyin(
            y, fmin=60, fmax=500, sr=sr  # More robust range for typical speech
        )
    return f0


def voice_changer(source_audio_path, target_audio_path):
    """Make the target audio roughly mimic the source's pitch and duration.

    This is a simplified melody/rhythm transfer, NOT a timbre/voice-cloning
    transfer: it applies a single global pitch shift (ratio of mean F0s) and a
    single global time stretch (ratio of durations) to the target audio.

    Parameters
    ----------
    source_audio_path, target_audio_path : str or None
        File paths as provided by the Gradio Audio components.

    Returns
    -------
    str
        Path of the written output WAV file.

    Raises
    ------
    gr.Error
        If either input is missing or any processing step fails.
    """
    if source_audio_path is None or target_audio_path is None:
        raise gr.Error("Please upload both source and target audio files.")
    # Ensure audio files are in WAV format
    source_wav_path = None
    target_wav_path = None
    try:
        source_wav_path = convert_to_wav(source_audio_path)
        target_wav_path = convert_to_wav(target_audio_path)

        # Load both files at their native sample rates.
        y_source, sr_source = librosa.load(source_wav_path, sr=None)
        y_target, sr_target = librosa.load(target_wav_path, sr=None)

        # Bring the target to the source's sample rate so every later step
        # (pitch analysis, duration, pitch shift, output writing) operates at
        # the single rate sr_source.
        if sr_source != sr_target:
            y_target = librosa.resample(y_target, orig_sr=sr_target, target_sr=sr_source)
            print(f"Resampled target audio from {sr_target} to {sr_source} Hz.")

        # --- Simplified Voice Transfer Logic (Melody/Rhythm Transfer) ---
        # BUG FIX: the target must be analyzed at sr_source — its rate after
        # the resample above. The original code passed the stale sr_target to
        # pyin, which scales every reported frequency and corrupts the
        # pitch-shift ratio whenever the two files had different rates.
        f0_source = _estimate_f0(y_source, sr_source)
        f0_target = _estimate_f0(y_target, sr_source)

        # Mean F0 over voiced frames only (pyin marks unvoiced frames NaN).
        # This is very simplistic: one global ratio, no temporal alignment.
        f0_source_valid = f0_source[~np.isnan(f0_source)]
        f0_target_valid = f0_target[~np.isnan(f0_target)]
        mean_f0_source = np.mean(f0_source_valid) if len(f0_source_valid) > 0 else 0
        mean_f0_target = np.mean(f0_target_valid) if len(f0_target_valid) > 0 else 0

        if mean_f0_target > 0.1 and mean_f0_source > 0.1:  # guard tiny/zero means
            pitch_shift_factor = mean_f0_source / mean_f0_target
        else:
            pitch_shift_factor = 1.0  # no shift if either clip has no usable pitch

        # pitch_shift takes semitones: log2(ratio) * 12.
        n_steps = 12 * np.log2(pitch_shift_factor) if pitch_shift_factor > 0 else 0
        print(f"Calculated pitch shift: {n_steps:.2f} semitones.")

        # Crude tempo alignment: stretch/compress the target so its duration
        # roughly matches the source. Both signals are at sr_source here.
        # BUG FIX: the original measured the target's duration with the stale
        # sr_target, wrong after the resample above.
        duration_source = librosa.get_duration(y=y_source, sr=sr_source)
        duration_target = librosa.get_duration(y=y_target, sr=sr_source)
        if duration_target > 0:
            duration_ratio = duration_source / duration_target
        else:
            duration_ratio = 1.0  # no time change if target has no duration
        print(f"Duration Source: {duration_source:.2f}s, Target: {duration_target:.2f}s, Ratio: {duration_ratio:.2f}")

        if duration_source > 0 and duration_target > 0 and duration_source != duration_target:
            # BUG FIX: librosa.effects.time_stretch speeds audio UP for
            # rate > 1 (output length = input length / rate). To make the
            # target's duration match the source we therefore need
            # rate = duration_target / duration_source — the inverse of
            # duration_ratio. The original passed duration_ratio, which moved
            # the duration the wrong way (doubling the mismatch).
            y_target_adjusted_tempo = librosa.effects.time_stretch(
                y_target, rate=duration_target / duration_source
            )
        else:
            y_target_adjusted_tempo = y_target  # no stretching needed (or degenerate input)

        # Phase-vocoder pitch shift (not PSOLA); artifacts are expected.
        y_output = librosa.effects.pitch_shift(
            y_target_adjusted_tempo, sr=sr_source, n_steps=n_steps
        )

        # Normalize to full scale to prevent clipping on export.
        y_output = librosa.util.normalize(y_output)

        output_file_path = "output_voice_changed.wav"
        sf.write(output_file_path, y_output, sr_source)
        return output_file_path
    except Exception as e:
        raise gr.Error(f"An error occurred during voice processing: {e}")
    finally:
        # Clean up temporary WAV files irrespective of success/failure
        if source_wav_path and os.path.exists(source_wav_path):
            os.remove(source_wav_path)
        if target_wav_path and os.path.exists(target_wav_path):
            os.remove(target_wav_path)
# Gradio Interface
# Layout: markdown explainer on top, the two audio uploads side by side,
# then the output player and the trigger button.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# Simple Audio Style Transfer (Voice Changer - Experimental)
Upload two audio files. The goal is to make the "Target Audio" mimic the pitch/melody of the "Source Audio".
**Note:** This is a very basic implementation and **not a full voice cloning/timbre transfer**.
It performs a simplified pitch and tempo adjustment based on the source's characteristics.
Expect artifacts and limited "voice changing" effect. For true voice cloning, more advanced models are needed.
"""
    )
    with gr.Row():
        # type="filepath" means the components hand voice_changer file paths,
        # matching its (source_audio_path, target_audio_path) signature.
        source_audio_input = gr.Audio(type="filepath", label="Source Audio (Reference Voice/Style)", sources=["upload"])
        target_audio_input = gr.Audio(type="filepath", label="Target Audio (Voice to be Changed)", sources=["upload"])
    output_audio = gr.Audio(label="Transformed Audio")
    voice_changer_button = gr.Button("Transform Voice")
    # Wire the button to the processing function; the returned WAV path is
    # rendered by the output Audio component.
    voice_changer_button.click(
        fn=voice_changer,
        inputs=[source_audio_input, target_audio_input],
        outputs=output_audio
    )

# Launch the app only when run as a script (not when imported as a module).
if __name__ == "__main__":
    demo.launch()