import gradio as gr

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import random
import string
import soundfile as sf
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the punkt tokenizer data; fetch it if it is missing
nltk.download("punkt", quiet=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
# load the processor
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# load the model
model = SpeechT5ForTextToSpeech.from_pretrained(
    "microsoft/speecht5_tts").to(device)
# load the vocoder, which converts the model's spectrogram output into a waveform
vocoder = SpeechT5HifiGan.from_pretrained(
    "microsoft/speecht5_hifigan").to(device)
# we load this dataset to get the speaker embeddings
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors", split="validation")

# speaker ids from the embeddings dataset
speakers = {
    'awb': 0,     # Scottish male
    'bdl': 1138,  # US male
    'clb': 2271,  # US female
    'jmk': 3403,  # Canadian male
    'ksp': 4535,  # Indian male
    'rms': 5667,  # US male
    'slt': 6799   # US female
}
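
# For example, embeddings_dataset[speakers['clb']]["xvector"] is the 512-dim
# x-vector of the 'clb' (US female) voice; any of these ids can be passed as
# the `speaker` argument of save_text_to_speech below.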

def generateAudio(text_to_audio, s3_save_as):

    def recortar_texto(texto, max_tokens=500):
        # truncate the input to at most max_tokens word tokens so it stays
        # within SpeechT5's input limit
        tokens = word_tokenize(texto)
        if len(tokens) <= max_tokens:
            return texto
        recortado = ' '.join(tokens[:max_tokens])
        return recortado
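
    # e.g. recortar_texto("one two three four", max_tokens=2) returns "one two"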


    def save_text_to_speech(text, speaker=None):
        # truncate the text first, then tokenize it for the model
        text = recortar_texto(text, max_tokens=500)
        inputs = processor(text=text, return_tensors="pt").to(device)
        if speaker is not None:
            # load xvector containing speaker's voice characteristics from a dataset
            speaker_embeddings = torch.tensor(
                embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
        else:
            # random vector, meaning a random voice
            speaker_embeddings = torch.randn((1, 512)).to(device)
        # generate speech with the models
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        if speaker is not None:
            # if we have a speaker, we use the speaker's id in the filename
            output_filename = f"{speaker}-{'-'.join(text.split()[:6])}.wav"
        else:
            # if we don't have a speaker, we use a random string in the filename
            random_str = ''.join(random.sample(
                string.ascii_letters + string.digits, k=5))
            output_filename = f"{random_str}-{'-'.join(text.split()[:6])}.wav"
        # save the generated speech with a 16 kHz sampling rate; soundfile
        # picks the format from the extension, and WAV works on any libsndfile
        # build (MP3 output needs libsndfile >= 1.1.0)
        sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
        # return the filename for reference
        return output_filename



    # note: s3_save_as is accepted by the interface but not used here
    output_filename = save_text_to_speech(text_to_audio, speakers['clb'])  # id 2271, US female

    return f"Saved {output_filename}"

iface = gr.Interface(
    fn=generateAudio,
    inputs=[gr.Textbox(label="text_to_audio"), gr.Textbox(label="s3_save_as")],
    outputs="text",
)
iface.launch()
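
# A minimal way to exercise the handler without the web UI (run it before
# launch(), since launch() blocks; the second argument fills the unused
# s3_save_as slot):
#
#   print(generateAudio("Hello from SpeechT5.", ""))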