File size: 1,861 Bytes
3f53728
a13c70d
11fdcf5
 
 
820c332
29f6b1d
 
820c332
11fdcf5
 
 
 
 
a13c70d
3f53728
a13c70d
820c332
006f2a8
820c332
29f6b1d
 
 
820c332
 
d3b49fc
29f6b1d
006f2a8
29f6b1d
 
d3b49fc
820c332
29f6b1d
 
d3b49fc
820c332
29f6b1d
820c332
 
 
d3b49fc
820c332
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a13c70d
006f2a8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import spaces
import gradio as gr
import torch
from TTS.api import TTS
import os
import json
import scipy.io.wavfile as wavfile
import numpy as np

# Set the Coqui terms-of-service flag before the model is created; presumably
# this lets the model download proceed without an interactive prompt
# (TODO confirm against the Coqui TTS documentation).
os.environ.update({"COQUI_TOS_AGREED": "1"})

# Device the model is moved to (hard-coded to the GPU).
device = "cuda"

# Build the XTTS v2 model once at import time, then move it to the device.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
tts = tts.to(device)

def _to_int16_pcm(samples):
    """Convert float audio samples to 16-bit PCM without integer wrap-around."""
    samples = np.asarray(samples)
    # Samples outside [-1, 1] would overflow int16 after scaling and wrap to
    # the opposite sign (audible clicks), so clamp them first.
    return (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)


def _phoneme_timeline(phonemes, durations):
    """Pair each phoneme with start/end times obtained by summing durations."""
    timeline = []
    elapsed = 0
    for phoneme, duration in zip(phonemes, durations):
        start_time = elapsed
        end_time = start_time + duration
        timeline.append({
            "phoneme": phoneme,
            "start": float(start_time),
            "end": float(end_time),
            "duration": float(duration),
        })
        elapsed = end_time
    return timeline


@spaces.GPU(enable_queue=True)
def clone(text, audio):
    """Synthesize ``text`` in the voice of ``audio`` and export phoneme timing.

    Args:
        text: Text to synthesize (the language is fixed to ``"pl"``).
        audio: File path of the reference voice recording.

    Returns:
        A tuple ``(wav_path, json_path)``: the synthesized WAV file and a JSON
        file listing each phoneme with its start, end and duration.

    Raises:
        gr.Error: If ``text`` is empty/blank or no reference audio was given.
    """
    # Imported locally so the top-of-file import order stays untouched.
    import tempfile

    # Fail early with a readable message instead of an opaque model error.
    if not text or not text.strip():
        raise gr.Error("Podaj tekst do syntezy.")
    if not audio:
        raise gr.Error("Wgraj plik audio z głosem referencyjnym.")

    # Speech generation.
    # NOTE(review): this unpacking assumes tts.tts(..., return_dict=True)
    # yields four values (wav, alignment, text_info, extra) — confirm against
    # the installed TTS version.
    wav, alignment, text_info, _ = tts.tts(text=text, speaker_wav=audio, language="pl", return_dict=True)

    # Each call writes into its own directory so overlapping requests cannot
    # overwrite one another's files; the base names are kept unchanged.
    out_dir = tempfile.mkdtemp(prefix="clone_")
    wav_path = os.path.join(out_dir, "output.wav")
    json_path = os.path.join(out_dir, "phonemes_info.json")

    # Save the audio as a 16-bit WAV file.
    # 24000 is presumably the model's output sample rate — TODO confirm.
    wavfile.write(wav_path, 24000, _to_int16_pcm(wav))

    # Build the per-phoneme timing information.
    phonemes_data = _phoneme_timeline(text_info['phonemes'], alignment['durations'])

    # Write the phoneme information to a JSON file.
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(phonemes_data, f, ensure_ascii=False, indent=2)

    return wav_path, json_path

# Gradio interface: a text box and a reference-voice upload go in; the
# synthesized audio and the phoneme JSON file come out.
input_components = [
    gr.Textbox(label='Tekst do syntezy'),
    gr.Audio(type='filepath', label='Plik audio z głosem referencyjnym'),
]
output_components = [
    gr.Audio(type='filepath', label='Zsyntezowana mowa'),
    gr.File(label='Informacje o fonemach (JSON)'),
]
app_theme = gr.themes.Base(primary_hue="teal", secondary_hue="teal", neutral_hue="slate")

iface = gr.Interface(
    fn=clone,
    inputs=input_components,
    outputs=output_components,
    title='Klonowanie Głosu z Informacjami o Fonemach',
    theme=app_theme,
)

# Start the web app.
iface.launch()