Spaces:
Runtime error
Runtime error
Commit
·
d84903c
1
Parent(s):
1b59e3b
update tts
Browse files
app.py
CHANGED
@@ -84,6 +84,9 @@ from scipy.io import wavfile
|
|
84 |
|
85 |
import re
|
86 |
|
|
|
|
|
|
|
87 |
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
88 |
sample = ds[0]["audio"]
|
89 |
|
@@ -121,7 +124,30 @@ def text_to_speech_loc(text):
|
|
121 |
print("audio: ", audio)
|
122 |
return audio
|
123 |
|
124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
|
126 |
class GPTRemote(LLM):
|
127 |
n: int
|
@@ -593,7 +619,7 @@ Text2Sound_tool2 = Tool(
|
|
593 |
Text2Sound_tool_loc = Tool(
|
594 |
name = "Text To Sound API 2",
|
595 |
# func = Text2Sound,
|
596 |
-
func =
|
597 |
description = "Useful when you need to convert text into sound file."
|
598 |
)
|
599 |
|
|
|
84 |
|
85 |
import re
|
86 |
|
87 |
+
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
88 |
+
import torch
|
89 |
+
|
90 |
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
91 |
sample = ds[0]["audio"]
|
92 |
|
|
|
124 |
print("audio: ", audio)
|
125 |
return audio
|
126 |
|
127 |
+
def text_to_speech_loc2(text):
    """Synthesize ``text`` to speech with SpeechT5 and save it as a WAV file.

    Loads the "microsoft/speecht5_tts" processor and model plus the
    "microsoft/speecht5_hifigan" vocoder, generates a waveform using the
    x-vector at index 7306 of the "Matthijs/cmu-arctic-xvectors" validation
    split as the speaker embedding, and writes the result to
    ``sample-<YYYYmmdd-HHMMSS>.wav`` (relative path) at 16 kHz.

    Args:
        text: The text to be spoken.

    Returns:
        The file object the WAV data was written to. It is already closed
        when returned; its ``name`` attribute holds the output file name.
    """
    # NOTE(review): processor, model, vocoder and the embeddings dataset are
    # re-loaded on every call; consider caching them if this is called often.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Synthesize the caller's text. Previously a hard-coded sentence was
    # passed here and the ``text`` argument was silently ignored.
    inputs = processor(text=text, return_tensors="pt")

    # Load the x-vector containing the speaker's voice characteristics.
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    print("Type of speech: ", type(speech))

    # Timestamped file name so successive calls do not overwrite each other
    # (at one-second resolution).
    timestr = time.strftime("%Y%m%d-%H%M%S")
    with open('sample-' + timestr + '.wav', 'wb') as audio:
        sf.write(audio, speech.numpy(), samplerate=16000)
    print("audio: ", audio)
    return audio
|
148 |
+
|
149 |
+
|
150 |
+
# Calls text_to_speech_loc2 once when the module is executed and prints its
# return value.
# NOTE(review): looks like a debugging smoke test — it runs a full synthesis
# at start-up; confirm it is still wanted.
print("text to speech2: ", text_to_speech_loc2("Good morning."))
|
151 |
|
152 |
class GPTRemote(LLM):
|
153 |
n: int
|
|
|
619 |
# Registers text_to_speech_loc2 as the callable behind the
# "Text To Sound API 2" tool.
Text2Sound_tool_loc = Tool(
    name="Text To Sound API 2",
    func=text_to_speech_loc2,
    description="Useful when you need to convert text into sound file.",
)
|
625 |
|