OuroborosM committed on
Commit
d84903c
·
1 Parent(s): 1b59e3b

update tts

Browse files
Files changed (1) hide show
  1. app.py +28 -2
app.py CHANGED
@@ -84,6 +84,9 @@ from scipy.io import wavfile
84
 
85
  import re
86
 
 
 
 
87
  ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
88
  sample = ds[0]["audio"]
89
 
@@ -121,7 +124,30 @@ def text_to_speech_loc(text):
121
  print("audio: ", audio)
122
  return audio
123
 
124
- print("text to speech: ", text_to_speech_loc("Good morning."))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
  class GPTRemote(LLM):
127
  n: int
@@ -593,7 +619,7 @@ Text2Sound_tool2 = Tool(
593
  Text2Sound_tool_loc = Tool(
594
  name = "Text To Sound API 2",
595
  # func = Text2Sound,
596
- func = text_to_speech_loc,
597
  description = "Useful when you need to convert text into sound file."
598
  )
599
 
 
84
 
85
  import re
86
 
87
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
88
+ import torch
89
+
90
  ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
91
  sample = ds[0]["audio"]
92
 
 
124
  print("audio: ", audio)
125
  return audio
126
 
127
def text_to_speech_loc2(text):
    """Synthesize *text* to a WAV file with the SpeechT5 TTS model.

    Loads the ``microsoft/speecht5_tts`` processor and model together with
    the ``microsoft/speecht5_hifigan`` vocoder, generates speech for *text*
    using a speaker x-vector taken from the ``Matthijs/cmu-arctic-xvectors``
    dataset, and writes the result to ``sample-<YYYYmmdd-HHMMSS>.wav`` in the
    current working directory at a 16 kHz sample rate.

    Args:
        text: The sentence(s) to convert into speech.

    Returns:
        The file object the audio was written through. It is already closed
        when returned; use its ``name`` attribute to locate the WAV file.
    """
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Tokenize the caller's text. The previous version passed a hard-coded
    # debug sentence here, so every call produced the same audio.
    inputs = processor(text=text, return_tensors="pt")

    # Load an x-vector describing the speaker's voice characteristics.
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    print("Type of speech: ", type(speech))

    # Timestamped file name so successive calls do not overwrite each other
    # (at one-second resolution).
    timestr = time.strftime("%Y%m%d-%H%M%S")
    with open('sample-' + timestr + '.wav', 'wb') as audio:
        sf.write(audio, speech.numpy(), samplerate=16000)
    print("audio: ", audio)
    return audio
148
+
149
+
150
+ print("text to speech2: ", text_to_speech_loc2("Good morning."))
151
 
152
  class GPTRemote(LLM):
153
  n: int
 
619
  Text2Sound_tool_loc = Tool(
620
  name = "Text To Sound API 2",
621
  # func = Text2Sound,
622
+ func = text_to_speech_loc2,
623
  description = "Useful when you need to convert text into sound file."
624
  )
625