Spaces:
Paused
Paused
File size: 3,798 Bytes
11eb5d0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
import hmac
import os
import random
import re
import string
from io import BytesIO

import boto3
import gradio as gr
import soundfile as sf
import torch
from datasets import load_dataset
from gradio.inputs import Textbox
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# AWS credentials and the destination bucket are read from the process
# environment; any variable that is not set resolves to None.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
S3_BUCKET_NAME = os.environ.get("BUCKET_NAME")

# Use the GPU when torch reports one as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Processor that turns input text into model inputs.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Text-to-speech model, moved onto the selected device.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
model = model.to(device)

# HiFi-GAN vocoder handed to the model when generating speech.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# Dataset of x-vectors; its rows supply the speaker embeddings.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
# Maps each speaker code to its index in the embeddings dataset.
speakers = dict(
    awb=0,     # Scottish male
    bdl=1138,  # US male
    clb=2271,  # US female
    jmk=3403,  # Canadian male
    ksp=4535,  # Indian male
    rms=5667,  # US male
    slt=6799,  # US female
)
# Number of words sent to the model per generate_speech call.
_WORDS_PER_SEGMENT = 30
# Sample rate passed to soundfile when encoding the WAV.
_SAMPLE_RATE = 16000
# Row of the embeddings dataset used as the voice (the 'clb' entry of `speakers`).
_DEFAULT_SPEAKER_INDEX = 2271


def _is_authorized(key_id):
    """Return True only if *key_id* equals the configured AWS access key id.

    An unset or empty ``AWS_ACCESS_KEY_ID`` never authorizes anyone, and a
    non-string *key_id* (e.g. ``None`` from an API call) is rejected instead
    of being compared.
    """
    if not AWS_ACCESS_KEY_ID or not isinstance(key_id, str):
        return False
    # compare_digest avoids leaking how many leading characters matched;
    # encoding to bytes lets it accept non-ASCII input without raising.
    return hmac.compare_digest(
        key_id.encode("utf-8"), AWS_ACCESS_KEY_ID.encode("utf-8")
    )


def _clean_text(text):
    """Drop every character except word characters, whitespace, '.' and ',',
    then collapse each whitespace run (including line breaks) to one space."""
    text = re.sub(r"[^\w\s.,]", "", text)
    return re.sub(r"\s+", " ", text).strip()


def _split_into_segments(text, words_per_segment=_WORDS_PER_SEGMENT):
    """Split *text* on whitespace into chunks of at most *words_per_segment* words."""
    words = text.split()
    return [
        " ".join(words[start:start + words_per_segment])
        for start in range(0, len(words), words_per_segment)
    ]


def _synthesize(segments, speaker=None):
    """Generate speech for each segment and concatenate the results along dim 0.

    Args:
        segments: Non-empty list of text chunks.
        speaker: Index into ``embeddings_dataset``; when ``None`` a random
            (1, 512) embedding is drawn once and reused for every segment.

    Returns:
        A single tensor holding the audio of all segments in order.
    """
    # The embedding does not depend on the segment, so build it once.
    if speaker is not None:
        xvector = embeddings_dataset[speaker]["xvector"]
        speaker_embeddings = torch.tensor(xvector).unsqueeze(0).to(device)
    else:
        speaker_embeddings = torch.randn((1, 512)).to(device)

    chunks = []
    for segment in segments:
        inputs = processor(text=segment, return_tensors="pt").to(device)
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )
        chunks.append(speech)
    return torch.cat(chunks, dim=0)


def _upload_wav(waveform, s3_key):
    """Encode *waveform* as WAV in memory and upload it to the bucket under *s3_key*."""
    buffer = BytesIO()
    sf.write(buffer, waveform.cpu().numpy(), samplerate=_SAMPLE_RATE, format="WAV")
    # Rewind so the upload reads from the start of the encoded file.
    buffer.seek(0)

    s3 = boto3.client(
        "s3",
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    s3.upload_fileobj(buffer, S3_BUCKET_NAME, s3_key)


def generateAudio(text_to_audio, s3_save_as, key_id):
    """Convert text to speech and store the WAV file in S3 under ``public/``.

    Args:
        text_to_audio: Text to synthesize. Characters other than word
            characters, whitespace, periods and commas are removed first.
        s3_save_as: Desired object name; whitespace runs are replaced by
            ``-`` and ``.wav`` is appended.
        key_id: Caller-supplied key that must equal ``AWS_ACCESS_KEY_ID``.

    Returns:
        The final object name (including ``.wav``) on success, or the string
        ``"not permition"`` when *key_id* is rejected.

    Raises:
        ValueError: If no words remain in *text_to_audio* after cleaning.
    """
    if not _is_authorized(key_id):
        return "not permition"

    s3_save_as = "-".join(s3_save_as.split()) + ".wav"

    segments = _split_into_segments(_clean_text(text_to_audio))
    if not segments:
        # Fail with a clear message instead of an opaque error from torch.cat([]).
        raise ValueError("text_to_audio contains no speakable text")

    waveform = _synthesize(segments, _DEFAULT_SPEAKER_INDEX)
    _upload_wav(waveform, "public/" + s3_save_as)
    return s3_save_as
# One text field per generateAudio argument, in positional order.
iface = gr.Interface(
    fn=generateAudio,
    inputs=[
        Textbox(label=field_label)
        for field_label in ("text_to_audio", "S3url", "aws_key_id")
    ],
    outputs="text",
    title="Text-to-Audio",
)
# Start serving the interface.
iface.launch()
|