SAUL19 committed on
Commit
e79dd51
·
1 Parent(s): db8bd10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -28
app.py CHANGED
@@ -1,8 +1,10 @@
1
  import gradio as gr
2
  from gradio.inputs import Textbox
 
3
  import nltk
4
  nltk.download('punkt')
5
  from nltk.tokenize import word_tokenize
 
6
  import re
7
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
8
  from datasets import load_dataset
@@ -19,7 +21,6 @@ AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
19
  S3_BUCKET_NAME = os.getenv("BUCKET_NAME")
20
 
21
  device = "cuda" if torch.cuda.is_available() else "cpu"
22
-
23
  # load the processor
24
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
25
  # load the model
@@ -28,7 +29,7 @@ model = SpeechT5ForTextToSpeech.from_pretrained(
28
  # load the vocoder, that is the voice encoder
29
  vocoder = SpeechT5HifiGan.from_pretrained(
30
  "microsoft/speecht5_hifigan").to(device)
31
- # load the dataset to get the speaker embeddings
32
  embeddings_dataset = load_dataset(
33
  "Matthijs/cmu-arctic-xvectors", split="validation")
34
 
@@ -44,18 +45,29 @@ speakers = {
44
  }
45
 
46
  def generateAudio(text_to_audio, s3_save_as):
47
-
48
  def cut_text(text, max_tokens=500):
49
  # Remove every character except word characters (letters, digits, underscore), whitespace, periods and commas
50
  text = re.sub(r"[^\w\s.,]", "", text)
51
 
52
- tokens = word_tokenize(text_to_audio)
53
  if len(tokens) <= max_tokens:
54
  return text
55
 
56
  cut = ' '.join(tokens[:max_tokens])
57
  return cut
58
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  def save_text_to_speech(text, speaker=None):
61
  # Preprocess the text and trim it
@@ -74,39 +86,27 @@ def generateAudio(text_to_audio, s3_save_as):
74
  inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
75
  if speaker is not None:
76
  # if we have a speaker, we use the speaker's ID in the filename
77
- output_filename = f"{speaker}-{'-'.join(text.split()[:6])}.mp3"
78
  else:
79
  # if we don't have a speaker, we use a random string in the filename
80
  random_str = ''.join(random.sample(
81
  string.ascii_letters+string.digits, k=5))
82
- output_filename = f"{random_str}-{'-'.join(text.split()[:6])}.mp3"
83
-
84
- # Save the generated speech to BytesIO buffer
85
  audio_buffer = BytesIO()
86
- sf.write(audio_buffer, speech.cpu().numpy(), samplerate=16000)
 
87
  audio_buffer.seek(0)
88
 
89
- # Upload the audio buffer to S3
90
- s3_key = f"{s3_save_as}.mp3"
91
- s3 = boto3.client(
92
- 's3',
93
- aws_access_key_id=AWS_ACCESS_KEY_ID,
94
- aws_secret_access_key=AWS_SECRET_ACCESS_KEY
95
- )
96
- s3.upload_fileobj(audio_buffer, S3_BUCKET_NAME, s3_key)
97
-
98
- # Return the S3 URL of the uploaded audio file
99
- s3_url = f"https://{S3_BUCKET_NAME}.s3.amazonaws.com/{s3_key}"
100
- return s3_url
101
 
 
 
102
 
103
- s3_url = save_text_to_speech(text_to_audio, speakers["clb"])
104
- return f"Saved audio: {s3_url}"
105
 
 
106
 
107
- iface = gr.Interface(
108
- fn=generateAudio,
109
- inputs=[Textbox(label="Text to Audio"), Textbox(label="S3 Save As")],
110
- outputs="text"
111
- )
112
  iface.launch()
 
1
  import gradio as gr
2
  from gradio.inputs import Textbox
3
+
4
  import nltk
5
  nltk.download('punkt')
6
  from nltk.tokenize import word_tokenize
7
+
8
  import re
9
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
10
  from datasets import load_dataset
 
21
  S3_BUCKET_NAME = os.getenv("BUCKET_NAME")
22
 
23
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
24
  # load the processor
25
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
26
  # load the model
 
29
  # load the vocoder, that is the voice encoder
30
  vocoder = SpeechT5HifiGan.from_pretrained(
31
  "microsoft/speecht5_hifigan").to(device)
32
+ # we load this dataset to get the speaker embeddings
33
  embeddings_dataset = load_dataset(
34
  "Matthijs/cmu-arctic-xvectors", split="validation")
35
 
 
45
  }
46
 
47
  def generateAudio(text_to_audio, s3_save_as):
48
+
49
  def cut_text(text, max_tokens=500):
50
  # Remove every character except word characters (letters, digits, underscore), whitespace, periods and commas
51
  text = re.sub(r"[^\w\s.,]", "", text)
52
 
53
+ tokens = word_tokenize(text)
54
  if len(tokens) <= max_tokens:
55
  return text
56
 
57
  cut = ' '.join(tokens[:max_tokens])
58
  return cut
59
 
60
+ def save_audio_to_s3(audio, filename):
61
+ # Create an instance of the S3 client
62
+ s3 = boto3.client('s3',
63
+ aws_access_key_id=AWS_ACCESS_KEY_ID,
64
+ aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
65
+
66
+ # Full path of the file in the bucket
67
+ s3_key = "public/" + filename
68
+
69
+ # Upload the audio file to the S3 bucket
70
+ s3.upload_fileobj(audio, S3_BUCKET_NAME, s3_key)
71
 
72
  def save_text_to_speech(text, speaker=None):
73
  # Preprocess the text and trim it
 
86
  inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
87
  if speaker is not None:
88
  # if we have a speaker, we use the speaker's ID in the filename
89
+ output_filename = f"{speaker}-{'-'.join(text.split()[:6])}.wav"
90
  else:
91
  # if we don't have a speaker, we use a random string in the filename
92
  random_str = ''.join(random.sample(
93
  string.ascii_letters+string.digits, k=5))
94
+ output_filename = f"{random_str}-{'-'.join(text.split()[:6])}.wav"
95
+ # create BytesIO object to store the audio
 
96
  audio_buffer = BytesIO()
97
+ # save the generated speech to the BytesIO buffer
98
+ sf.write(audio_buffer, speech.cpu().numpy(), samplerate=16000, format='WAV')
99
  audio_buffer.seek(0)
100
 
101
+ # Save the audio to S3
102
+ save_audio_to_s3(audio_buffer, output_filename)
 
 
 
 
 
 
 
 
 
 
103
 
104
+ # return the filename for reference
105
+ return output_filename
106
 
107
+ output_filename = save_text_to_speech(text_to_audio, "clb")
 
108
 
109
+ return f"Saved {output_filename}"
110
 
111
+ iface = gr.Interface(fn=generateAudio, inputs=[Textbox(label="text_to_audio"), Textbox(label="s3_save_as")], outputs="text")
 
 
 
 
112
  iface.launch()