File size: 5,777 Bytes
37acd6d
2d78591
e79dd51
21e33f9
2d78591
 
 
 
21e33f9
 
 
0b22baf
 
21e33f9
 
 
 
0b22baf
37acd6d
2d78591
 
 
 
 
 
 
 
 
e79dd51
2d78591
 
 
 
 
 
 
 
 
 
 
 
 
 
0b22baf
f00a1a5
 
 
 
fc96cf9
77e4720
fc96cf9
dd06693
21e33f9
 
fc96cf9
1f7e24e
 
fc96cf9
1f7e24e
 
e10a399
 
2d78591
b5f38ca
fc96cf9
 
 
 
 
 
 
 
 
 
 
c5e2889
fc96cf9
3b6208e
fc96cf9
3b6208e
2d78591
 
 
21e33f9
fc96cf9
 
e10a399
fc96cf9
 
 
 
 
 
e50afa4
fc96cf9
e50afa4
 
fc96cf9
 
 
 
 
 
 
 
 
 
 
 
cb831b1
 
b5f38ca
 
2d78591
e50afa4
0b22baf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea1deaf
 
f0b791d
bbe0e74
ea1deaf
f0b791d
ea1deaf
0b22baf
ea1deaf
0b22baf
 
 
 
 
04db253
0b22baf
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import gradio as gr
from gradio.inputs import Textbox

import re
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import soundfile as sf
import boto3
from io import BytesIO
import os
import botocore
from time import sleep

# Credentials and bucket name are read from the process environment; each
# value is None when the corresponding variable is not set.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
S3_BUCKET_NAME = os.environ.get("BUCKET_NAME")

# Key prefix used when listing the source documents in the bucket.
FOLDER = 'public/mdx/'

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Processor that turns input text into model inputs.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Text-to-speech model, moved to the selected device.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
model = model.to(device)

# HiFi-GAN vocoder, moved to the selected device.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# Dataset that provides the speaker embeddings ("xvector" entries).
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)

# Speaker ids from the embeddings dataset, keyed by speaker code.
speakers = dict(
    awb=0,     # Scottish male
    bdl=1138,  # US male
    clb=2271,  # US female
    jmk=3403,  # Canadian male
    ksp=4535,  # Indian male
    rms=5667,  # US male
    slt=6799,  # US female
)


def generateAudio(text_to_audio, s3_save_as, key_id):
    """Synthesize speech for a text and upload it to S3 as a WAV file.

    The text is stripped of every character that is not a word character,
    whitespace, period or comma, split into 30-word segments, and each
    segment is synthesized separately. The waveforms are concatenated,
    written as 16 kHz WAV into memory and uploaded to
    ``public/<normalized name>.wav`` in ``S3_BUCKET_NAME``.

    Args:
        text_to_audio: Raw text to convert to speech.
        s3_save_as: Base name of the audio file. Runs of whitespace are
            replaced by "-" and ".wav" is appended.
        key_id: Caller-supplied key; must equal ``AWS_ACCESS_KEY_ID``.

    Returns:
        The normalized file name (with ".wav"), or the string
        "not permition" when ``key_id`` does not match.

    Raises:
        ValueError: If the text contains no words after cleaning.

    Note:
        Upload errors are printed, not raised, so the file name is returned
        even when the upload failed.
    """
    # Number of words synthesized per model call.
    words_per_segment = 30
    # Index into the embeddings dataset for the voice; 2271 is listed as
    # 'clb' (US female) in the module-level ``speakers`` mapping.
    speaker_index = 2271

    if AWS_ACCESS_KEY_ID != key_id:
        return "not permition"

    s3_save_as = '-'.join(s3_save_as.split()) + ".wav"

    # Drop everything except word characters, whitespace, periods and commas.
    # str.split() then discards all whitespace (including line breaks), so no
    # separate whitespace normalization is needed.
    words = re.sub(r"[^\w\s.,]", "", text_to_audio).split()
    if not words:
        # Without this guard torch.cat() would fail on an empty list with a
        # much less helpful error.
        raise ValueError("text_to_audio contains no speakable text")

    segments = [
        ' '.join(words[start:start + words_per_segment])
        for start in range(0, len(words), words_per_segment)
    ]

    # The speaker embedding is the same for every segment: build it once.
    speaker_embeddings = torch.tensor(
        embeddings_dataset[speaker_index]["xvector"]
    ).unsqueeze(0).to(device)

    # Generate audio for each segment, then join the pieces end to end.
    audio_segments = []
    for segment in segments:
        inputs = processor(text=segment, return_tensors="pt").to(device)
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        audio_segments.append(speech)
    combined_audio = torch.cat(audio_segments, dim=0)

    # Encode the waveform as WAV into an in-memory buffer and rewind it so
    # the upload reads from the start.
    audio_buffer = BytesIO()
    sf.write(audio_buffer, combined_audio.cpu().numpy(),
             samplerate=16000, format='WAV')
    audio_buffer.seek(0)

    # Full path of the file in the bucket.
    s3_key = "public/" + s3_save_as
    try:
        s3 = boto3.client('s3',
                          aws_access_key_id=AWS_ACCESS_KEY_ID,
                          aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
        s3.upload_fileobj(audio_buffer, S3_BUCKET_NAME, s3_key)
        print("SUCCESS SAVE IN S3 WHERE" + s3_key + " & " + S3_BUCKET_NAME)
    except Exception as err:
        # Deliberately best effort: report the failure and carry on.
        print("Error al guardar")
        print(err)

    return s3_save_as


def check_if_exist(bucket_name, key):
    """Report whether an object exists in an S3 bucket.

    Args:
        bucket_name: Name of the bucket to look in.
        key: Key of the object to look for.

    Returns:
        True when loading the object succeeds, False when S3 answers with
        error code "404".

    Raises:
        botocore.exceptions.ClientError: For any error code other than "404".
    """
    resource = boto3.resource(
        's3',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    target = resource.Object(bucket_name, key)

    try:
        target.load()
    except botocore.exceptions.ClientError as exc:
        if exc.response['Error']['Code'] != "404":
            # Something other than "not found" went wrong; let it propagate.
            raise
        # The object does not exist.
        return False

    return True


def list_s3_files():
    """Generate the missing audio file for every document under FOLDER.

    Each object listed under the ``FOLDER`` prefix of ``S3_BUCKET_NAME`` is
    checked for a matching ``public/<name>.wav`` object. When that audio is
    missing, the document is downloaded, decoded as UTF-8 and passed to
    ``generateAudio``. A failure on one document is printed and counted, and
    the loop continues with the next one.

    Returns:
        A one-line summary with the number of generated, already existing,
        empty and failed documents. It is what the Gradio output component
        wired to this callback displays.

    Raises:
        botocore.exceptions.ClientError: If the existence check fails with an
            error code other than "404" (re-raised by ``check_if_exist``).
    """
    s3_client = boto3.client('s3',
                             aws_access_key_id=AWS_ACCESS_KEY_ID,
                             aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

    s3 = boto3.resource('s3',
                        aws_access_key_id=AWS_ACCESS_KEY_ID,
                        aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

    my_bucket = s3.Bucket(S3_BUCKET_NAME)

    generated = skipped = empty = failed = 0

    for s3_object in my_bucket.objects.filter(Prefix=FOLDER):
        filename_ext = os.path.basename(s3_object.key)
        if not filename_ext:
            # A key ending in "/" (e.g. the folder placeholder) has no file
            # name and therefore no document to convert.
            continue

        filename = os.path.splitext(filename_ext)[0]
        # generateAudio replaces whitespace runs with "-" before uploading,
        # so the existence check has to look for that same normalized name.
        audio_name = '-'.join(filename.split())
        s3audio = 'public/%s.wav' % audio_name

        if check_if_exist(S3_BUCKET_NAME, s3audio):
            print('Audio %s already exists!' % s3audio)
            skipped += 1
        else:
            try:
                # Use the listed key as-is so documents in nested prefixes
                # are fetched from where they actually live.
                response = s3_client.get_object(Bucket=S3_BUCKET_NAME,
                                                Key=s3_object.key)
                content = response['Body'].read().decode('utf-8')

                if content.strip():
                    generateAudio(content, filename, AWS_ACCESS_KEY_ID)
                    print("SUCCESS " + audio_name + ".wav")
                    generated += 1
                else:
                    print("NOT CONTENT:" + filename_ext)
                    empty += 1
            except Exception as err:
                # Per-document boundary: one bad file must not abort the
                # whole batch. Report it and move on.
                print("ERROR " + s3_object.key)
                print(err)
                failed += 1

        # Short pause between documents.
        sleep(0.5)

    return "generated: %d, already existing: %d, empty: %d, failed: %d" % (
        generated, skipped, empty, failed)


# Minimal UI: one textbox and one button. Clicking the button runs
# list_s3_files and sends its return value to the textbox.
with gr.Blocks() as demo:
    text = gr.Textbox()
    bimage = gr.Button("Generate Blog Images for PineSearch!")
    bimage.click(fn=list_s3_files, outputs=text)

demo.launch()