# NOTE: the "Spaces: / Running" lines originally here were HuggingFace Spaces
# page chrome captured during extraction — they are not part of the app source.
import io
import os
import tempfile

import gradio as gr
import librosa
import numpy as np
import soundfile as sf  # Required for sf.write
from pydub import AudioSegment
# Normalize any uploaded audio file to WAV via pydub
def convert_to_wav(audio_file_path):
    """Transcode the file at ``audio_file_path`` to WAV and return the new path.

    The converted file is written next to the original, with ``.wav`` appended
    to the full original filename. Any decode/export failure is surfaced to the
    UI as a ``gr.Error``.
    """
    wav_file_path = f"{audio_file_path}.wav"
    try:
        AudioSegment.from_file(audio_file_path).export(wav_file_path, format="wav")
    except Exception as e:
        raise gr.Error(f"Error converting audio to WAV: {e}")
    return wav_file_path
# Pitch-track estimation helper shared by source and target audio
def _estimate_f0(y, sr, label):
    """Estimate the F0 track of ``y`` (sampled at ``sr``) with ``librosa.pyin``.

    Tries a wide musical range (C2-C7) first; if pyin rejects those parameters
    it falls back to a 60-500 Hz range typical of speech. ``label`` names the
    signal ("source"/"target") in the fallback log message. Returns the f0
    array, with NaN in unvoiced frames.
    """
    try:
        f0, _voiced_flag, _voiced_probs = librosa.pyin(
            y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr
        )
    except Exception as e:
        print(f"Pyin failed for {label} with general range, trying broader range: {e}")
        f0, _voiced_flag, _voiced_probs = librosa.pyin(
            y, fmin=60, fmax=500, sr=sr  # More robust range for typical speech
        )
    return f0


# Main voice changer function (simplified)
def voice_changer(source_audio_path, target_audio_path):
    """Make the target audio roughly match the source's mean pitch and duration.

    This is a simplified melody/rhythm transfer, not a timbre transfer: it
    computes a single mean-F0 pitch-shift ratio and a single tempo ratio and
    applies them to the target with librosa's phase-vocoder effects.

    Parameters are filesystem paths (as provided by ``gr.Audio(type="filepath")``).
    Returns the path of the rendered WAV file. Raises ``gr.Error`` on missing
    inputs or any processing failure. Temporary WAV conversions are always
    cleaned up.
    """
    if source_audio_path is None or target_audio_path is None:
        raise gr.Error("Please upload both source and target audio files.")
    source_wav_path = None
    target_wav_path = None
    try:
        # Ensure audio files are in WAV format
        source_wav_path = convert_to_wav(source_audio_path)
        target_wav_path = convert_to_wav(target_audio_path)

        # Load at native sample rates
        y_source, sr_source = librosa.load(source_wav_path, sr=None)
        y_target, sr_target = librosa.load(target_wav_path, sr=None)

        # Resample target audio to the source sample rate if different.
        # From here on the target signal is at sr_source, so all later
        # analysis of y_target must use sr_source (the original code kept
        # using sr_target, skewing the target F0 and duration estimates).
        if sr_source != sr_target:
            y_target = librosa.resample(y_target, orig_sr=sr_target, target_sr=sr_source)
            print(f"Resampled target audio from {sr_target} to {sr_source} Hz.")

        # --- Simplified Voice Transfer Logic (Melody/Rhythm Transfer) ---
        # 1./2. Pitch estimation for both signals (shared helper)
        f0_source = _estimate_f0(y_source, sr_source, "source")
        f0_target = _estimate_f0(y_target, sr_source, "target")

        # Drop NaN (unvoiced) frames before averaging
        f0_source_valid = f0_source[~np.isnan(f0_source)]
        f0_target_valid = f0_target[~np.isnan(f0_target)]

        # Single global pitch ratio from mean F0 — simplistic, ignores
        # pitch variation over time.
        mean_f0_source = np.mean(f0_source_valid) if len(f0_source_valid) > 0 else 0
        mean_f0_target = np.mean(f0_target_valid) if len(f0_target_valid) > 0 else 0
        if mean_f0_target > 0.1 and mean_f0_source > 0.1:  # guard near-zero means
            pitch_shift_factor = mean_f0_source / mean_f0_target
        else:
            pitch_shift_factor = 1.0  # no shift when no usable pitch was found

        # Convert the frequency ratio to semitones for pitch_shift
        n_steps = 12 * np.log2(pitch_shift_factor) if pitch_shift_factor > 0 else 0
        print(f"Calculated pitch shift: {n_steps:.2f} semitones.")

        # Crude tempo matching. librosa.effects.time_stretch produces
        # output of duration input/rate, so to stretch the target to the
        # source's duration the rate must be target/source — the original
        # code had this ratio inverted.
        duration_source = librosa.get_duration(y=y_source, sr=sr_source)
        duration_target = librosa.get_duration(y=y_target, sr=sr_source)
        if duration_source > 0 and duration_target > 0:
            stretch_rate = duration_target / duration_source
        else:
            stretch_rate = 1.0  # no time change for degenerate durations
        print(f"Duration Source: {duration_source:.2f}s, Target: {duration_target:.2f}s, Ratio: {stretch_rate:.2f}")

        if not np.isclose(stretch_rate, 1.0):
            y_target_adjusted_tempo = librosa.effects.time_stretch(y_target, rate=stretch_rate)
        else:
            y_target_adjusted_tempo = y_target  # No stretching needed

        # Phase-vocoder pitch shift (not PSOLA; artifacts expected)
        y_output = librosa.effects.pitch_shift(y_target_adjusted_tempo, sr=sr_source, n_steps=n_steps)

        # Normalize the output audio to prevent clipping
        y_output = librosa.util.normalize(y_output)

        # Unique temp file per request so concurrent Gradio sessions don't
        # clobber each other's output (the original wrote a fixed filename
        # into the working directory).
        fd, output_file_path = tempfile.mkstemp(suffix=".wav", prefix="voice_changed_")
        os.close(fd)  # sf.write reopens by path
        sf.write(output_file_path, y_output, sr_source)
        return output_file_path
    except gr.Error:
        raise  # already user-facing; don't re-wrap
    except Exception as e:
        raise gr.Error(f"An error occurred during voice processing: {e}")
    finally:
        # Clean up temporary WAV files irrespective of success/failure
        for tmp_path in (source_wav_path, target_wav_path):
            if tmp_path and os.path.exists(tmp_path):
                os.remove(tmp_path)
# --- Gradio interface wiring -------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Simple Audio Style Transfer (Voice Changer - Experimental)
        Upload two audio files. The goal is to make the "Target Audio" mimic the pitch/melody of the "Source Audio".
        **Note:** This is a very basic implementation and **not a full voice cloning/timbre transfer**.
        It performs a simplified pitch and tempo adjustment based on the source's characteristics.
        Expect artifacts and limited "voice changing" effect. For true voice cloning, more advanced models are needed.
        """
    )
    # Two uploads side by side: the style reference and the audio to transform
    with gr.Row():
        src_audio = gr.Audio(type="filepath", label="Source Audio (Reference Voice/Style)", sources=["upload"])
        tgt_audio = gr.Audio(type="filepath", label="Target Audio (Voice to be Changed)", sources=["upload"])
    result_audio = gr.Audio(label="Transformed Audio")
    transform_btn = gr.Button("Transform Voice")
    transform_btn.click(
        fn=voice_changer,
        inputs=[src_audio, tgt_audio],
        outputs=result_audio,
    )

if __name__ == "__main__":
    demo.launch()