Spaces:

0qwpifs
/

VoiceReplacer

Sleeping

File size: 3,282 Bytes

import gradio as gr
import torchaudio
import torch
import os
from speechbrain.inference import SpeakerRecognition

# Path to the reference recording of the user's own voice (relative path, so
# it is resolved against the process's current working directory).
user_voice_path = "voice_recording.wav"

# Load the pretrained SpeechBrain speaker-recognition model once, at import
# time, so that every request handled by process_audio reuses the same instance.
model = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb",
)

def _to_mono(waveform):
    """Average a (channels, samples) waveform down to a single channel."""
    if waveform.dim() > 1 and waveform.shape[0] > 1:
        return waveform.mean(dim=0, keepdim=True)
    return waveform


def process_audio(input_audio, pitch_shift=0):
    """Blend the uploaded voice with the stored user voice and save the result.

    Args:
        input_audio: Path to the uploaded audio file (what the UI's
            ``type="filepath"`` audio input delivers). A tuple whose first
            element is a path string is also accepted.
        pitch_shift: Number of semitones to shift the uploaded audio by before
            embedding; ``0`` or ``None`` disables the shift.

    Returns:
        The path of the written WAV file ("converted_audio.wav") on success,
        or ``None`` if any step failed. Failures are printed, not raised, so
        the UI callback never propagates an exception.
    """
    try:
        # Only a real path can be loaded below, so reject anything else up
        # front instead of letting the audio loader fail on it.
        if isinstance(input_audio, tuple) and input_audio and isinstance(input_audio[0], str):
            input_audio_path = input_audio[0]
        elif isinstance(input_audio, str):
            input_audio_path = input_audio
        else:
            raise ValueError(f"Неподдерживаемый формат входных данных: {type(input_audio)}")

        # The reference voice sample must exist before anything else is done.
        if not os.path.exists(user_voice_path):
            raise FileNotFoundError(f"Файл голосового образца не найден по пути: {user_voice_path}")

        # Load both recordings.
        user_waveform, user_sr = torchaudio.load(user_voice_path)
        target_waveform, target_sr = torchaudio.load(input_audio_path)

        # torchaudio.load yields (channels, samples) while encode_batch treats
        # the leading axis as the batch, so multi-channel files would be
        # embedded as several unrelated utterances. Downmix to mono first.
        user_waveform = _to_mono(user_waveform)
        target_waveform = _to_mono(target_waveform)

        # Bring the uploaded audio to the reference recording's sample rate.
        # NOTE(review): the pretrained model presumably expects a specific
        # input rate; confirm that the reference file is recorded at it.
        if user_sr != target_sr:
            target_waveform = torchaudio.functional.resample(target_waveform, target_sr, user_sr)

        # Optional pitch change (truthiness also covers pitch_shift=None).
        if pitch_shift:
            target_waveform = torchaudio.functional.pitch_shift(
                waveform=target_waveform,
                sample_rate=user_sr,
                n_steps=pitch_shift,
            )

        # Inference only: keep autograd off for embedding and synthesis alike.
        with torch.no_grad():
            embeddings_user = model.encode_batch(user_waveform)
            embeddings_target = model.encode_batch(target_waveform)
            # Midpoint between the two speaker embeddings.
            converted_embeddings = embeddings_user + (embeddings_target - embeddings_user) * 0.5

            # NOTE(review): a speaker-recognition model only produces
            # embeddings; turning them back into audio needs a separate
            # synthesizer. Fail with an explicit message rather than an
            # opaque AttributeError when none is attached to the model.
            synthesizer = getattr(model, "synth_model", None)
            if synthesizer is None:
                raise RuntimeError(
                    "Загруженная модель не содержит синтезатора (synth_model) "
                    "и не может восстановить аудио из embeddings"
                )
            converted_waveform = synthesizer.generate(
                converted_embeddings,
                length=target_waveform.shape[-1],
            )

        # Write the result next to the app and hand its path back to the UI.
        output_path = "converted_audio.wav"
        torchaudio.save(output_path, converted_waveform.cpu(), user_sr)

        return output_path
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and return None.
        print(f"Ошибка: {e}")
        return None

# Gradio UI wiring: process_audio receives the uploaded audio as a file path
# (type="filepath") together with the slider value, and its return value is
# shown in the output audio player.
demo = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(label="Загрузите аудиофайл с голосом для замены", type="filepath"),
        # Pitch change in semitones: arguments are -24, 24, 0 with step=1
        # (presumably minimum, maximum and initial value; confirm against the
        # installed Gradio version's Slider signature).
        gr.Slider(-24, 24, 0, step=1, label="Изменение тона (в полутонов)")
    ],
    outputs=gr.Audio(label="Обработанный аудиофайл"),
    title="VoiceReplacer Pro",
    description="Замените голос в аудиофайле на ваш собственный голос с возможностью изменения тона"
)

# Start the web app as soon as this module is executed.
demo.launch()