Spaces:

OuroborosM
/

STLA-BABY

Runtime error

App Files Files Community

OuroborosM committed on Aug 24, 2023

Commit

d84903c

1 Parent(s): 1b59e3b

update tts

Browse files

Files changed (1) hide show

app.py +28 -2

app.py CHANGED Viewed

@@ -84,6 +84,9 @@ from scipy.io import wavfile
 import re
 ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 sample = ds[0]["audio"]
@@ -121,7 +124,30 @@ def text_to_speech_loc(text):
     print("audio: ", audio)
     return audio
-print("text to speech: ", text_to_speech_loc("Good morning."))
 class GPTRemote(LLM):
     n: int
@@ -593,7 +619,7 @@ Text2Sound_tool2 = Tool(
 Text2Sound_tool_loc = Tool(
     name = "Text To Sound API 2",
     # func = Text2Sound,
-    func = text_to_speech_loc,
     description = "Useful when you need to convert text into sound file."
 )

 import re
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+import torch
 ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 sample = ds[0]["audio"]
     print("audio: ", audio)
     return audio
+def text_to_speech_loc2(text):
+    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+    inputs = processor(text="Hello, my dear. Glad to see you. hahahaha...", return_tensors="pt")
+    # load xvector containing speaker's voice characteristics from a dataset
+    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+    print("Type of speech: ", type(speech))
+    timestr = time.strftime("%Y%m%d-%H%M%S")
+    # sampling_rate = 16000
+    with open('sample-' + timestr + '.wav', 'wb') as audio:
+        sf.write(audio, speech.numpy(), samplerate=16000)
+    # audio = sf.write("convert1.wav", speech, samplerate=16000)
+    print("audio: ", audio)
+    return audio
+print("text to speech2: ", text_to_speech_loc2("Good morning."))
 class GPTRemote(LLM):
     n: int
 Text2Sound_tool_loc = Tool(
     name = "Text To Sound API 2",
     # func = Text2Sound,
+    # Route this tool to the SpeechT5-based text_to_speech_loc2 defined in this file.
+    func = text_to_speech_loc2,
     description = "Useful when you need to convert text into sound file."
 )