fahadqazi committed
Commit 2aebcac · verified · 1 Parent(s): 535f4fd

Create app.py

Files changed (1)
  1. app.py +149 -0
app.py ADDED
@@ -0,0 +1,149 @@
+ import gradio as gr
+ import torch
+ import soundfile as sf
+ import spaces
+ import os
+ import numpy as np
+ import re
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from speechbrain.pretrained import EncoderClassifier
+ from datasets import load_dataset
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ def load_models_and_data():
+     model_name = "microsoft/speecht5_tts"
+     processor = SpeechT5Processor.from_pretrained(model_name)
+     model = SpeechT5ForTextToSpeech.from_pretrained("fahadqazi/testts1234").to(device)
+     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+
+     spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
+     speaker_model = EncoderClassifier.from_hparams(
+         source=spk_model_name,
+         run_opts={"device": device},
+         savedir=os.path.join("/tmp", spk_model_name),
+     )
+
+     # Load a sample from a dataset for default embedding
+     dataset = load_dataset("erenfazlioglu/turkishvoicedataset", split="train")
+     example = dataset[304]
+
+     return model, processor, vocoder, speaker_model, example
+
+ model, processor, vocoder, speaker_model, default_example = load_models_and_data()
+
+ # def create_speaker_embedding(waveform):
+ #     with torch.no_grad():
+ #         speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0).to(device))
+ #         speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+ #         speaker_embeddings = speaker_embeddings.squeeze()
+ #     return speaker_embeddings
+
+ # def prepare_default_embedding(example):
+ #     audio = example["audio"]
+ #     return create_speaker_embedding(audio["array"])
+
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+ speaker_embeddings = embeddings_dataset[7306]["xvector"]
+ speaker_embeddings = torch.tensor(speaker_embeddings).to(device)
+
+ default_embedding = speaker_embeddings  # prepare_default_embedding(default_example)
+
+ # replacements = [
+ #     ("â", "a"),   # Long a
+ #     ("ç", "ch"),  # Ch as in "chair"
+ #     ("ğ", "gh"),  # Silent g or slight elongation of the preceding vowel
+ #     ("ı", "i"),   # Dotless i
+ #     ("î", "i"),   # Long i
+ #     ("ö", "oe"),  # Similar to German ö
+ #     ("ş", "sh"),  # Sh as in "shoe"
+ #     ("ü", "ue"),  # Similar to German ü
+ #     ("û", "u"),   # Long u
+ # ]
+
+ number_words = {
+     0: "sıfır", 1: "bir", 2: "iki", 3: "üç", 4: "dört", 5: "beş", 6: "altı", 7: "yedi", 8: "sekiz", 9: "dokuz",
+     10: "on", 11: "on bir", 12: "on iki", 13: "on üç", 14: "on dört", 15: "on beş", 16: "on altı", 17: "on yedi",
+     18: "on sekiz", 19: "on dokuz", 20: "yirmi", 30: "otuz", 40: "kırk", 50: "elli", 60: "altmış", 70: "yetmiş",
+     80: "seksen", 90: "doksan", 100: "yüz", 1000: "bin"
+ }
+
+ def number_to_words(number):
+     if number < 20:
+         return number_words[number]
+     elif number < 100:
+         tens, unit = divmod(number, 10)
+         return number_words[tens * 10] + (" " + number_words[unit] if unit else "")
+     elif number < 1000:
+         hundreds, remainder = divmod(number, 100)
+         return (number_words[hundreds] + " yüz" if hundreds > 1 else "yüz") + (" " + number_to_words(remainder) if remainder else "")
+     elif number < 1000000:
+         thousands, remainder = divmod(number, 1000)
+         return (number_to_words(thousands) + " bin" if thousands > 1 else "bin") + (" " + number_to_words(remainder) if remainder else "")
+     elif number < 1000000000:
+         millions, remainder = divmod(number, 1000000)
+         return number_to_words(millions) + " milyon" + (" " + number_to_words(remainder) if remainder else "")
+     elif number < 1000000000000:
+         billions, remainder = divmod(number, 1000000000)
+         return number_to_words(billions) + " milyar" + (" " + number_to_words(remainder) if remainder else "")
+     else:
+         return str(number)
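+ # Examples: number_to_words(25) -> "yirmi beş"; number_to_words(2024) -> "iki bin yirmi dört"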
+
+ def replace_numbers_with_words(text):
+     def replace(match):
+         number = int(match.group())
+         return number_to_words(number)
+
+     # Find standalone numbers and replace them with their word forms
+     result = re.sub(r'\b\d+\b', replace, text)
+
+     return result
+
+ def normalize_text(text):
+     # Convert to lowercase
+     # text = text.lower()
+
+     # Replace numbers with words
+     text = replace_numbers_with_words(text)
+
+     # Apply character replacements
+     # for old, new in replacements:
+     #     text = text.replace(old, new)
+
+     # Remove punctuation
+     text = re.sub(r'[^\w\s]', '', text)
+
+     return text
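+ # Example: normalize_text("Saat 25 oldu!") -> "Saat yirmi beş oldu"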
+
+ @spaces.GPU(duration=60)
+ def text_to_speech(text, audio_file=None):
+     # Normalize the input text
+     normalized_text = normalize_text(text)
+
+     # Prepare the input for the model
+     inputs = processor(text=normalized_text, return_tensors="pt").to(device)
+
+     # Use the default speaker embedding
+     speaker_embeddings = default_embedding
+
+     # Generate speech
+     with torch.no_grad():
+         speech = model.generate_speech(inputs["input_ids"], speaker_embeddings.unsqueeze(0), vocoder=vocoder)
+
+     speech_np = speech.cpu().numpy()
+
+     # SpeechT5's HiFi-GAN vocoder produces 16 kHz audio
+     return (16000, speech_np)
+
+ iface = gr.Interface(
+     fn=text_to_speech,
+     inputs=[
+         gr.Textbox(label="Enter Sindhi text to convert to speech")
+     ],
+     outputs=[
+         gr.Audio(label="Generated Speech", type="numpy")
+     ],
+     title="Sindhi SpeechT5 Text-to-Speech Demo",
+     description="Enter Sindhi text, and listen to the generated speech."
+ )
+
+ iface.launch(share=True)
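
For a quick smoke test outside the Gradio UI, text_to_speech can be called directly. A minimal sketch, assuming the file above is importable as app and that iface.launch() is skipped or guarded by if __name__ == "__main__" (the sample sentence and output path are placeholders):

    import soundfile as sf
    from app import text_to_speech  # importing app also runs the model loading above

    # Synthesize one sentence and write the 16 kHz result to a WAV file
    sample_rate, audio = text_to_speech("Saat 25 oldu.")
    sf.write("output.wav", audio, sample_rate)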