# Spaces: PineSearch / generateAudio (Paused) - File size: 3,848 Bytes

import gradio as gr
from gradio.inputs import Textbox
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import random
import string
import soundfile as sf
import boto3
from io import BytesIO
import os

# AWS credentials and the target bucket are read from the environment;
# each value is None when the corresponding variable is unset.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
S3_BUCKET_NAME = os.environ.get("BUCKET_NAME")

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Processor: converts raw text into the tensors the model consumes.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Text-to-speech model, placed on the selected device.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
model = model.to(device)

# HiFi-GAN vocoder checkpoint, placed on the same device as the model.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# Dataset whose rows carry the "xvector" speaker embeddings.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)

# Maps each speaker code to its row index in the embeddings dataset.
speakers = dict(
    awb=0,     # Scottish male
    bdl=1138,  # US male
    clb=2271,  # US female
    jmk=3403,  # Canadian male
    ksp=4535,  # Indian male
    rms=5667,  # US male
    slt=6799,  # US female
)

def _cut_text(text, max_tokens=500):
    """Clean *text* and cap it at *max_tokens* word tokens.

    Every character that is not a word character, whitespace, a period or a
    comma is removed. If the cleaned text has more than ``max_tokens`` NLTK
    word tokens, only the first ``max_tokens`` are kept and re-joined with
    single spaces (so spacing may differ from the input).

    Args:
        text: Raw input text.
        max_tokens: Maximum number of word tokens to keep.

    Returns:
        The cleaned, possibly truncated, text.
    """
    # Remove non-alphanumeric characters, except periods and commas
    cleaned = re.sub(r"[^\w\s.,]", "", text)

    # Tokenize the cleaned text (not the raw input) so that the length check
    # and the truncated output both reflect what is actually synthesized.
    tokens = word_tokenize(cleaned)
    if len(tokens) <= max_tokens:
        return cleaned

    return ' '.join(tokens[:max_tokens])


def _save_text_to_speech(text, s3_save_as, speaker=None):
    """Synthesize *text* and upload the audio to S3.

    Args:
        text: Text to speak; it is cleaned and truncated before synthesis.
        s3_save_as: S3 object key without extension (".mp3" is appended).
        speaker: Row index into ``embeddings_dataset`` whose "xvector" is
            used as the voice. When None, a random 512-dim vector is used.

    Returns:
        The ``https://<bucket>.s3.amazonaws.com/<key>`` URL of the upload.
    """
    # NOTE(review): the cap below is counted in NLTK word tokens, while the
    # processor applies its own tokenization - confirm that 500 words stays
    # within the model's input limit.
    text = _cut_text(text, max_tokens=500)
    inputs = processor(text=text, return_tensors="pt").to(device)

    if speaker is not None:
        # load xvector containing speaker's voice characteristics from a dataset
        speaker_embeddings = torch.tensor(
            embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
    else:
        # random vector, meaning a random voice
        speaker_embeddings = torch.randn((1, 512)).to(device)

    # generate speech with the models
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Encode into an in-memory buffer. A BytesIO has no file name, so
    # soundfile cannot infer the container from an extension; the format must
    # be passed explicitly. MP3 matches the ".mp3" key used below.
    # NOTE(review): MP3 output needs a libsndfile build with MP3 support.
    audio_buffer = BytesIO()
    sf.write(audio_buffer, speech.cpu().numpy(), samplerate=16000,
             format="MP3")
    audio_buffer.seek(0)  # rewind so the upload reads from the start

    # Upload the audio buffer to S3
    s3_key = f"{s3_save_as}.mp3"
    s3 = boto3.client(
        's3',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
    )
    s3.upload_fileobj(audio_buffer, S3_BUCKET_NAME, s3_key)

    # Return the S3 URL of the uploaded audio file
    return f"https://{S3_BUCKET_NAME}.s3.amazonaws.com/{s3_key}"


def generateAudio(text_to_audio, s3_save_as):
    """Convert text to speech with the "clb" voice and store it in S3.

    Args:
        text_to_audio: Text to synthesize.
        s3_save_as: S3 object key without extension (".mp3" is appended).

    Returns:
        A status string of the form ``"Saved audio: <s3 url>"``.
    """
    s3_url = _save_text_to_speech(text_to_audio, s3_save_as, speakers["clb"])
    return f"Saved audio: {s3_url}"


# Web UI: the two text inputs map positionally onto
# generateAudio(text_to_audio, s3_save_as); the returned status string is
# shown as plain text.
iface = gr.Interface(
    fn=generateAudio,
    # gr.Textbox is used instead of the deprecated gradio.inputs.Textbox.
    inputs=[
        gr.Textbox(label="Text to Audio"),
        gr.Textbox(label="S3 Save As"),
    ],
    outputs="text",
)
iface.launch()