SAUL19 committed on
Commit
11eb5d0
·
1 Parent(s): f00a1a5

Create old_app

Browse files
Files changed (1) hide show
  1. old_app +110 -0
old_app ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio.inputs import Textbox
3
+
4
+ import re
5
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
6
+ from datasets import load_dataset
7
+ import torch
8
+ import random
9
+ import string
10
+ import soundfile as sf
11
+ import boto3
12
+ from io import BytesIO
13
+ import os
14
+
15
# AWS credentials and the destination bucket are read from the environment;
# each value is None when its variable is not set.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
S3_BUCKET_NAME = os.environ.get("BUCKET_NAME")

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Text processor for the SpeechT5 text-to-speech checkpoint.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Text-to-speech model, moved to the selected device.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)

# HiFi-GAN vocoder checkpoint, moved to the same device as the model.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Dataset that supplies the speaker embeddings (x-vectors).
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

# Row index into the embeddings dataset for each speaker id.
speakers = dict(
    awb=0,     # Scottish male
    bdl=1138,  # US male
    clb=2271,  # US female
    jmk=3403,  # Canadian male
    ksp=4535,  # Indian male
    rms=5667,  # US male
    slt=6799,  # US female
)
42
+
43
+ def generateAudio(text_to_audio, s3_save_as, key_id):
44
+
45
+ if AWS_ACCESS_KEY_ID != key_id:
46
+ return "not permition"
47
+
48
+ s3_save_as = '-'.join(s3_save_as.split()) + ".wav"
49
+
50
+ def cut_text(text, max_tokens=500):
51
+ # Remove non-alphanumeric characters, except periods and commas
52
+ text = re.sub(r"[^\w\s.,]", "", text)
53
+
54
+ # Replace multiple spaces with a single space
55
+ text = re.sub(r"\s{2,}", " ", text)
56
+
57
+ # Remove line breaks
58
+ text = re.sub(r"\n", " ", text)
59
+
60
+ return text
61
+
62
+ def save_audio_to_s3(audio):
63
+ # Create an instance of the S3 client
64
+ s3 = boto3.client('s3',
65
+ aws_access_key_id=AWS_ACCESS_KEY_ID,
66
+ aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
67
+
68
+ # Full path of the file in the bucket
69
+ s3_key = "public/" + s3_save_as
70
+
71
+ # Upload the audio file to the S3 bucket
72
+ s3.upload_fileobj(audio, S3_BUCKET_NAME, s3_key)
73
+
74
+ def save_text_to_speech(text, speaker=None):
75
+ # Preprocess text and recortar
76
+ text = cut_text(text, max_tokens=500)
77
+
78
+ # Divide el texto en segmentos de 30 palabras
79
+ palabras = text.split()
80
+ segmentos = [' '.join(palabras[i:i+30]) for i in range(0, len(palabras), 30)]
81
+
82
+ # Generar audio para cada segmento y combinarlos
83
+ audio_segments = []
84
+ for segment in segmentos:
85
+ inputs = processor(text=segment, return_tensors="pt").to(device)
86
+ if speaker is not None:
87
+ speaker_embeddings = torch.tensor(embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
88
+ else:
89
+ speaker_embeddings = torch.randn((1, 512)).to(device)
90
+ speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
91
+ audio_segments.append(speech)
92
+
93
+ combined_audio = torch.cat(audio_segments, dim=0)
94
+
95
+ # Crear objeto BytesIO para almacenar el audio
96
+ audio_buffer = BytesIO()
97
+ sf.write(audio_buffer, combined_audio.cpu().numpy(), samplerate=16000, format='WAV')
98
+ audio_buffer.seek(0)
99
+
100
+ # Guardar el audio combinado en S3
101
+ save_audio_to_s3(audio_buffer)
102
+
103
+
104
+ save_text_to_speech(text_to_audio, 2271)
105
+ return s3_save_as
106
+
107
+
108
# Web UI: the three text inputs map positionally onto generateAudio's
# parameters (text_to_audio, s3_save_as, key_id); the result is shown as text.
iface = gr.Interface(
    fn=generateAudio,
    inputs=[
        # gr.Textbox is used in place of the deprecated gradio.inputs.Textbox.
        gr.Textbox(label="text_to_audio"),
        gr.Textbox(label="S3url"),
        # The key is a shared secret, so mask it in the browser.
        gr.Textbox(label="aws_key_id", type="password"),
    ],
    outputs="text",
    title="Text-to-Audio",
)
iface.launch()
110
+