Spaces:
Paused
Paused
File size: 3,519 Bytes
37acd6d 2d78591 e79dd51 e345c78 e79dd51 21e33f9 2d78591 21e33f9 37acd6d 2d78591 e79dd51 2d78591 b5f38ca 21e33f9 e79dd51 2d78591 21e33f9 2d78591 21e33f9 2d78591 b5f38ca e79dd51 b5f38ca e79dd51 2d78591 21e33f9 2d78591 b5f38ca e79dd51 21e33f9 e79dd51 21e33f9 2d78591 e79dd51 b5f38ca 2d78591 b5f38ca 2d78591 b5f38ca 21e33f9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
import gradio as gr
from gradio.inputs import Textbox
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import random
import string
import soundfile as sf
import boto3
from io import BytesIO
import os
# AWS credentials and the destination bucket are read from the process
# environment; each value is None when its variable is not set.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
S3_BUCKET_NAME = os.environ.get("BUCKET_NAME")
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Processor that turns raw text into model inputs.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Text-to-speech model, placed on the selected device.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
model = model.to(device)

# HiFi-GAN vocoder passed to the model when generating speech.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# Dataset of x-vectors; rows are looked up by index to obtain a speaker
# embedding.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
# Speaker code -> row index in the embeddings dataset.
speakers = dict(
    awb=0,     # Scottish male
    bdl=1138,  # US male
    clb=2271,  # US female
    jmk=3403,  # Canadian male
    ksp=4535,  # Indian male
    rms=5667,  # US male
    slt=6799,  # US female
)
def generateAudio(text_to_audio, s3_save_as):
    """Synthesize speech for a text and upload the WAV file to S3.

    Args:
        text_to_audio: Text to convert to speech.
        s3_save_as: Requested file name. Runs of whitespace are replaced by
            single hyphens and ``.wav`` is appended.

    Returns:
        The normalized file name. The audio is uploaded to the bucket named
        by ``S3_BUCKET_NAME`` under the key ``"public/" + <file name>``.
    """
    # The original read `save_as`, a name that does not exist in this scope,
    # so every call failed with NameError before doing any work.
    s3_save_as = '-'.join(s3_save_as.split()) + ".wav"

    def cut_text(text, max_tokens=500):
        """Strip punctuation other than '.' and ',' and cap the word count.

        Returns the cleaned text unchanged when it has at most
        ``max_tokens`` NLTK tokens; otherwise the first ``max_tokens``
        tokens joined by single spaces.
        """
        # Remove non-alphanumeric characters, except periods and commas
        text = re.sub(r"[^\w\s.,]", "", text)
        tokens = word_tokenize(text)
        if len(tokens) <= max_tokens:
            return text
        # NOTE(review): the cap counts NLTK word tokens, not the tokens the
        # processor produces -- confirm 500 words fits the model's input
        # limit.
        return ' '.join(tokens[:max_tokens])

    def save_audio_to_s3(audio):
        """Upload a readable binary file object to ``public/<s3_save_as>``."""
        # Create an instance of the S3 client
        s3 = boto3.client(
            's3',
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        )
        # Full path of the file in the bucket
        s3_key = "public/" + s3_save_as
        # Upload the audio file to the S3 bucket
        s3.upload_fileobj(audio, S3_BUCKET_NAME, s3_key)

    def save_text_to_speech(text, speaker=None):
        """Generate speech for ``text`` and upload it as a 16 kHz WAV.

        ``speaker`` is a row index into ``embeddings_dataset``; when it is
        None a random 512-dimensional embedding is used instead.
        """
        # Clean the text and trim it to the token limit
        text = cut_text(text, max_tokens=500)
        inputs = processor(text=text, return_tensors="pt").to(device)

        if speaker is not None:
            # load xvector containing speaker's voice characteristics
            xvector = embeddings_dataset[speaker]["xvector"]
            speaker_embeddings = torch.tensor(xvector).unsqueeze(0).to(device)
        else:
            # random vector, meaning a random voice
            speaker_embeddings = torch.randn((1, 512)).to(device)

        # generate speech with the models
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

        # Write the WAV into memory so no temporary file is needed.
        audio_buffer = BytesIO()
        sf.write(audio_buffer, speech.cpu().numpy(),
                 samplerate=16000, format='WAV')
        # Rewind so the upload reads from the start of the buffer.
        audio_buffer.seek(0)
        save_audio_to_s3(audio_buffer)

    # 'clb' (US female) is index 2271, the value previously hard-coded here.
    save_text_to_speech(text_to_audio, speakers['clb'])
    return s3_save_as
# Web UI: two text inputs (the text to speak and the output file name); the
# file name returned by generateAudio is shown as text.
# NOTE(review): the second label was an unterminated string literal split
# across two lines (a SyntaxError); "s3_save_as" is a reconstruction that
# mirrors the parameter name -- confirm the intended label.
iface = gr.Interface(
    fn=generateAudio,
    inputs=[Textbox(label="text_to_audio"), Textbox(label="s3_save_as")],
    outputs="text",
)
iface.launch()
|