import spaces
import gradio as gr
import torch
from TTS.api import TTS
import os
import json
import scipy.io.wavfile as wavfile
import numpy as np

# Required by Coqui TTS: non-interactive acceptance of the model license.
os.environ["COQUI_TOS_AGREED"] = "1"

# Fall back to CPU so the script still starts on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# XTTS v2 synthesizes audio at a fixed 24 kHz sample rate.
SAMPLE_RATE = 24000


@spaces.GPU(enable_queue=True)
def clone(text, audio, language="pl"):
    """Synthesize *text* in the voice of the reference *audio* clip.

    Args:
        text: Text to synthesize.
        audio: Filesystem path to a reference audio file whose voice is cloned.
        language: ISO language code used for synthesis (default: Polish).

    Returns:
        Tuple of (path to the synthesized WAV file,
                  path to the phoneme-timing JSON file).
    """
    # Generate speech; XTTS returns float samples nominally in [-1.0, 1.0].
    wav = tts.tts(text=text, speaker_wav=audio, language=language)

    # Convert to 16-bit PCM. Clip first: samples outside [-1, 1] would
    # otherwise wrap around when cast to int16, producing loud artifacts.
    wav_np = np.clip(np.asarray(wav), -1.0, 1.0)
    wavfile.write("./output.wav", SAMPLE_RATE, (wav_np * 32767).astype(np.int16))

    # NOTE(review): `Synthesizer.get_phonemes` is not a public API in current
    # Coqui TTS releases — confirm this method exists in the pinned version,
    # otherwise this call raises AttributeError at request time.
    phonemes_info = tts.synthesizer.get_phonemes(text, language=language)

    # Build per-phoneme timing records with cumulative start/end times.
    phonemes_data = []
    cumulative_duration = 0
    for phoneme, duration in phonemes_info:
        start_time = cumulative_duration
        end_time = start_time + duration
        phonemes_data.append({
            "phoneme": phoneme,
            "start": float(start_time),
            "end": float(end_time),
            "duration": float(duration),
        })
        cumulative_duration = end_time

    # Persist the phoneme timing information as UTF-8 JSON.
    with open("./phonemes_info.json", "w", encoding="utf-8") as f:
        json.dump(phonemes_data, f, ensure_ascii=False, indent=2)

    return "./output.wav", "./phonemes_info.json"


# Gradio interface (labels are user-facing Polish strings — kept verbatim).
iface = gr.Interface(
    fn=clone,
    inputs=[
        gr.Textbox(label='Tekst do syntezy'),
        gr.Audio(type='filepath', label='Plik audio z głosem referencyjnym')
    ],
    outputs=[
        gr.Audio(type='filepath', label='Zsyntezowana mowa'),
        gr.File(label='Informacje o fonemach (JSON)')
    ],
    title='Klonowanie Głosu z Informacjami o Fonemach',
    theme=gr.themes.Base(primary_hue="teal", secondary_hue="teal", neutral_hue="slate")
)

iface.launch(share=True)