import gradio as gr

import re
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import soundfile as sf
import boto3
from io import BytesIO
import os

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
S3_BUCKET_NAME = os.getenv("BUCKET_NAME")

device = "cuda" if torch.cuda.is_available() else "cpu"
# load the processor
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# load the model
model = SpeechT5ForTextToSpeech.from_pretrained(
    "microsoft/speecht5_tts").to(device)
# load the vocoder, which turns the generated spectrogram into a waveform
vocoder = SpeechT5HifiGan.from_pretrained(
    "microsoft/speecht5_hifigan").to(device)
# we load this dataset to get the speaker embeddings
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors", split="validation")

# speaker ids from the embeddings dataset
speakers = {
    'awb': 0,     # Scottish male
    'bdl': 1138,  # US male
    'clb': 2271,  # US female
    'jmk': 3403,  # Canadian male
    'ksp': 4535,  # Indian male
    'rms': 5667,  # US male
    'slt': 6799   # US female
}

def generateAudio(text_to_audio, s3_save_as, key_id):
    # Simple access check: the caller must supply the configured AWS access key id
    if AWS_ACCESS_KEY_ID != key_id:
        return "permission denied"

    # Build the S3 object name: replace spaces with dashes and append ".wav"
    s3_save_as = '-'.join(s3_save_as.split()) + ".wav"
    
    def cut_text(text, max_tokens=500):
        # Normalize the input text before synthesis (note: max_tokens is currently unused)
        # Remove non-alphanumeric characters, except periods and commas
        text = re.sub(r"[^\w\s.,]", "", text)

        # Replace runs of whitespace with a single space
        text = re.sub(r"\s{2,}", " ", text)

        # Remove any remaining line breaks
        text = re.sub(r"\n", " ", text)

        return text

    def save_audio_to_s3(audio):
        # Create an instance of the S3 client
        s3 = boto3.client('s3',
                          aws_access_key_id=AWS_ACCESS_KEY_ID,
                          aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

        # Full path of the file in the bucket
        s3_key = "public/" + s3_save_as

        # Upload the audio file to the S3 bucket
        s3.upload_fileobj(audio, S3_BUCKET_NAME, s3_key)

    def save_text_to_speech(text, speaker=None):
        # Preprocess and clean the text
        text = cut_text(text, max_tokens=500)

        # Split the text into segments of 30 words
        palabras = text.split()
        segmentos = [' '.join(palabras[i:i+30]) for i in range(0, len(palabras), 30)]

        # Generate audio for each segment and combine the results afterwards
        audio_segments = []
        for segment in segmentos:
            inputs = processor(text=segment, return_tensors="pt").to(device)
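            # Use the chosen speaker's x-vector, or a random speaker embedding if none is given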
            if speaker is not None:
                speaker_embeddings = torch.tensor(embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
            else:
                speaker_embeddings = torch.randn((1, 512)).to(device)
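            # Generate speech for this segment; the HiFi-GAN vocoder renders the waveform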
            speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
            audio_segments.append(speech)
        
        # Concatenate the per-segment waveforms into a single audio tensor
        combined_audio = torch.cat(audio_segments, dim=0)
        
        # Create a BytesIO buffer to hold the rendered audio
        audio_buffer = BytesIO()
        sf.write(audio_buffer, combined_audio.cpu().numpy(), samplerate=16000, format='WAV')
        audio_buffer.seek(0)
        
        # Upload the combined audio to S3
        save_audio_to_s3(audio_buffer)
    
    
    # Synthesize with the 'clb' (US female) speaker embedding and upload to S3
    save_text_to_speech(text_to_audio, speakers['clb'])
    return s3_save_as
    

iface = gr.Interface(
    fn=generateAudio,
    inputs=[gr.Textbox(label="text_to_audio"),
            gr.Textbox(label="s3_save_as"),
            gr.Textbox(label="aws_key_id")],
    outputs="text",
    title="Text-to-Audio")
iface.launch()
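
# Minimal client-side sketch (kept as comments so it does not run with the app).
# Once the interface is up, it could be called programmatically via gradio_client;
# the local URL and the "/predict" api_name below are assumptions based on Gradio
# defaults for a single-function Interface.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   result = client.predict(
#       "Hello from SpeechT5",        # text_to_audio
#       "my test audio",              # s3_save_as (spaces become dashes, ".wav" appended)
#       "YOUR_AWS_ACCESS_KEY_ID",     # must match the configured access key id
#       api_name="/predict",
#   )
#   print(result)  # name of the .wav object stored under "public/" in the bucket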