Spaces:

PineSearch
/

generateAudio

Paused

App Files Community

SAUL19 committed on Jun 23, 2023

Commit

e79dd51

1 Parent(s): db8bd10

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -28

app.py CHANGED Viewed

@@ -1,8 +1,10 @@
 import gradio as gr
 from gradio.inputs import Textbox
 import nltk
 nltk.download('punkt')
 from nltk.tokenize import word_tokenize
 import re
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from datasets import load_dataset
@@ -19,7 +21,6 @@ AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
 S3_BUCKET_NAME = os.getenv("BUCKET_NAME")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # load the processor
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 # load the model
@@ -28,7 +29,7 @@ model = SpeechT5ForTextToSpeech.from_pretrained(
 # load the vocoder, that is the voice encoder
 vocoder = SpeechT5HifiGan.from_pretrained(
     "microsoft/speecht5_hifigan").to(device)
-# load the dataset to get the speaker embeddings
 embeddings_dataset = load_dataset(
     "Matthijs/cmu-arctic-xvectors", split="validation")
@@ -44,18 +45,29 @@ speakers = {
 }
 def generateAudio(text_to_audio, s3_save_as):
     def cut_text(text, max_tokens=500):
         # Remove non-alphanumeric characters, except periods and commas
         text = re.sub(r"[^\w\s.,]", "", text)
-        tokens = word_tokenize(text_to_audio)
         if len(tokens) <= max_tokens:
             return text
         cut = ' '.join(tokens[:max_tokens])
         return cut
     def save_text_to_speech(text, speaker=None):
         # Preprocess text and recortar
@@ -74,39 +86,27 @@ def generateAudio(text_to_audio, s3_save_as):
             inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
         if speaker is not None:
             # if we have a speaker, we use the speaker's ID in the filename
-            output_filename = f"{speaker}-{'-'.join(text.split()[:6])}.mp3"
         else:
             # if we don't have a speaker, we use a random string in the filename
             random_str = ''.join(random.sample(
                 string.ascii_letters+string.digits, k=5))
-            output_filename = f"{random_str}-{'-'.join(text.split()[:6])}.mp3"
-        # Save the generated speech to BytesIO buffer
         audio_buffer = BytesIO()
-        sf.write(audio_buffer, speech.cpu().numpy(), samplerate=16000)
         audio_buffer.seek(0)
-        # Upload the audio buffer to S3
-        s3_key = f"{s3_save_as}.mp3"
-        s3 = boto3.client(
-            's3',
-            aws_access_key_id=AWS_ACCESS_KEY_ID,
-            aws_secret_access_key=AWS_SECRET_ACCESS_KEY
-        )
-        s3.upload_fileobj(audio_buffer, S3_BUCKET_NAME, s3_key)
-        # Return the S3 URL of the uploaded audio file
-        s3_url = f"https://{S3_BUCKET_NAME}.s3.amazonaws.com/{s3_key}"
-        return s3_url
-    s3_url = save_text_to_speech(text_to_audio, speakers["clb"])
-    return f"Saved audio: {s3_url}"
-iface = gr.Interface(
-    fn=generateAudio,
-    inputs=[Textbox(label="Text to Audio"), Textbox(label="S3 Save As")],
-    outputs="text"
-)
 iface.launch()

 import gradio as gr
 from gradio.inputs import Textbox
 import nltk
 nltk.download('punkt')
 from nltk.tokenize import word_tokenize
 import re
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from datasets import load_dataset
 S3_BUCKET_NAME = os.getenv("BUCKET_NAME")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # load the processor
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 # load the model
 # load the vocoder, that is the voice encoder
 vocoder = SpeechT5HifiGan.from_pretrained(
     "microsoft/speecht5_hifigan").to(device)
+# we load this dataset to get the speaker embeddings
 embeddings_dataset = load_dataset(
     "Matthijs/cmu-arctic-xvectors", split="validation")
 }
 def generateAudio(text_to_audio, s3_save_as):
     def cut_text(text, max_tokens=500):
         # Remove non-alphanumeric characters, except periods and commas
         text = re.sub(r"[^\w\s.,]", "", text)
+        tokens = word_tokenize(text)
         if len(tokens) <= max_tokens:
             return text
         cut = ' '.join(tokens[:max_tokens])
         return cut
+    def save_audio_to_s3(audio, filename):
+        # Create an instance of the S3 client
+        s3 = boto3.client('s3',
+                          aws_access_key_id=AWS_ACCESS_KEY_ID,
+                          aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
+        # Full path of the file in the bucket
+        s3_key = "public/" + filename
+        # Upload the audio file to the S3 bucket
+        s3.upload_fileobj(audio, S3_BUCKET_NAME, s3_key)
     def save_text_to_speech(text, speaker=None):
         # Preprocess text and recortar
             inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
         if speaker is not None:
             # if we have a speaker, we use the speaker's ID in the filename
+            output_filename = f"{speaker}-{'-'.join(text.split()[:6])}.wav"
         else:
             # if we don't have a speaker, we use a random string in the filename
             random_str = ''.join(random.sample(
                 string.ascii_letters+string.digits, k=5))
+            output_filename = f"{random_str}-{'-'.join(text.split()[:6])}.wav"
+        # create BytesIO object to store the audio
         audio_buffer = BytesIO()
+        # save the generated speech to the BytesIO buffer
+        sf.write(audio_buffer, speech.cpu().numpy(), samplerate=16000, format='WAV')
         audio_buffer.seek(0)
+        # Save the audio to S3
+        save_audio_to_s3(audio_buffer, output_filename)
+        # return the filename for reference
+        return output_filename
+    output_filename = save_text_to_speech(text_to_audio, "clb")
+    return f"Saved {output_filename}"
+iface = gr.Interface(fn=generateAudio, inputs=[Textbox(label="text_to_audio"), Textbox(label="s3_save_as")], outputs="text")
 iface.launch()