spaces cleanup

app.py CHANGED
@@ -45,18 +45,19 @@ AVAILABLE_MODELS = {
     # 'VoiceCraft 2.0': 'voicecraft',
     # 'Parler TTS': 'parler'
 
+    # HF Gradio Spaces:
     'coqui/xtts': 'coqui/xtts',
-    'collabora/WhisperSpeech': 'collabora/WhisperSpeech',
-    'myshell-ai/OpenVoice': 'myshell-ai/OpenVoice',
-    'myshell-ai/OpenVoiceV2': 'myshell-ai/OpenVoiceV2',
-    'mrfakename/MetaVoice-1B-v0.1': 'mrfakename/MetaVoice-1B-v0.1',
-    'Pendrokar/xVASynth': 'Pendrokar/xVASynth',
+    # 'collabora/WhisperSpeech': 'collabora/WhisperSpeech', # old gradio?
+    'myshell-ai/OpenVoice': 'myshell-ai/OpenVoice', # 4.29.0
+    'myshell-ai/OpenVoiceV2': 'myshell-ai/OpenVoiceV2', # 4.29.0
+    'mrfakename/MetaVoice-1B-v0.1': 'mrfakename/MetaVoice-1B-v0.1', # 4.29.0
+    'Pendrokar/xVASynth': 'Pendrokar/xVASynth', # EN-GB 4.29.0 4.42.0
     # 'coqui/CoquiTTS': 'coqui/CoquiTTS',
-    'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS',
+    'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # 4.29.0
 
     # Parler
-    'parler-tts/parler_tts': 'parler-tts/parler_tts',
-    'parler-tts/parler-tts-expresso': 'parler-tts/parler-tts-expresso',
+    'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29.0 4.42.0
+    'parler-tts/parler-tts-expresso': 'parler-tts/parler-tts-expresso', # 4.29.0 4.42.0
 
     # TTS w issues
     # 'PolyAI/pheme': '/predict#0', # sleepy HF Space
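The comments appended in this hunk (`# 4.29.0`, `# 4.42.0`) read like the Gradio SDK versions each Space was verified against, and `# EN-GB` like the accent of the pinned xVASynth voice; this is inferred from the pattern rather than stated in the commit.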
@@ -66,7 +67,7 @@ AVAILABLE_MODELS = {
     # 'styletts2/styletts2': '0#0', # API disabled
     # 'Manmay/tortoise-tts': '/predict#0', # Cannot skip text-from-file parameter
     # 'pytorch/Tacotron2': '0#0', # old gradio
-    # 'mrfakename/MeloTTS': 'mrfakename/MeloTTS', #
+    # 'mrfakename/MeloTTS': 'mrfakename/MeloTTS', # Error with EN # 4.29.0
     # 'parler-tts/parler_tts_mini': 'parler-tts/parler_tts_mini', # old gradio - ValueError: Unsupported protocol: sse_v3
 }
 
@@ -131,7 +132,7 @@ HF_SPACES = {
     'mrfakename/MeloTTS': {
         'name': 'mrfakename/MeloTTS',
         'function': '/synthesize',
-        'text_param_index':
+        'text_param_index': 0,
         'return_audio_index': 0,
     },
 
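`text_param_index` and `return_audio_index` are the two knobs that let one generic caller drive every Space: the first names the positional input slot that receives the prompt, the second names which output is the audio file path. Below is a minimal sketch of that flow, assuming the `HF_SPACES` registry above; `call_space` is a hypothetical helper, the real logic lives inline in `synthandreturn` further down:

```python
from gradio_client import Client

def call_space(model: str, text: str) -> str:
    # hypothetical helper; app.py does this inline with cached clients
    space = HF_SPACES[model]
    client = Client(space['name'])
    # neutral inputs, then force the prompt into the marked slot
    inputs = [None] * (space['text_param_index'] + 1)
    inputs[space['text_param_index']] = text
    results = client.predict(*inputs, api_name=space['function'])
    # a Space may return a single path or a tuple of outputs
    return results if isinstance(results, str) else results[space['return_audio_index']]
```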
@@ -199,14 +200,14 @@ OVERRIDE_INPUTS = {
         3: 0.7, #Tempo - Gradio Slider issue: takes min. rather than value
     },
     'Pendrokar/xVASynth': {
-        1: 'ccby_nvidia_hifi_92_F', #fine-tuned voice model name
+        1: 'ccby_nvidia_hifi_92_F', #fine-tuned voice model name; #92 BRITISH
         3: 1.0, #pacing/duration - Gradio Slider issue: takes min. rather than value
     },
     'suno/bark': {
-        1: 'Speaker 3 (en)',
+        1: 'Speaker 3 (en)', # voice
     },
     'amphion/Text-to-Speech': {
-        1: 'LikeManyWaters',
+        1: 'LikeManyWaters', # voice
     },
     'LeeSangHoon/HierSpeech_TTS': {
         1: DEFAULT_VOICE_SAMPLE, # voice sample
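Keys in each `OVERRIDE_INPUTS` entry are 0-based positional parameter indices and values are the pinned inputs; this is how the arena fixes voices and works around the noted Gradio Slider bug, where the slider's example value comes back as its minimum. Applying such a table is a few lines; this sketch mirrors the loop that appears in the `@@ -913` hunk below:

```python
def apply_overrides(modelname: str, inputs: list) -> list:
    # hypothetical helper: overwrite the example input at each pinned index
    for index, value in OVERRIDE_INPUTS.get(modelname, {}).items():
        inputs[index] = value
    return inputs
```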
@@ -218,24 +219,27 @@ OVERRIDE_INPUTS = {
         7: 1111,
     },
     'Manmay/tortoise-tts': {
-        1: None, # text-from-file;
-        2: 'angie',
+        1: None, # text-from-file; cannot skip and doesn't work without
+        2: 'angie', # voice
         3: None,
         4: 'No',
     },
     'mrfakename/MeloTTS': {
-
-
+        1: 'EN', # speaker
+        # 1: 'EN-US', # speaker
+        2: 1, # speed
         3: 'EN', # language
     },
     'parler-tts/parler_tts': {
-        1: '
+        1: 'Elisabeth\'s voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise.', # description/prompt
     },
     'parler-tts/parler-tts-expresso': {
-        1: '
+        1: 'Elisabeth\'s voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise.', # description/prompt
     },
 }
 
+hf_clients = {}
+
 SPACE_ID = os.getenv('SPACE_ID')
 MAX_SAMPLE_TXT_LENGTH = 300
 MIN_SAMPLE_TXT_LENGTH = 10
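`hf_clients` is a new module-level cache so that each `gradio_client.Client` is constructed once and then reused; building a client fetches the Space's config over the network, so reuse saves a round-trip on every vote. The lookup pattern used later in this diff (the `@@ -812` hunk) amounts to:

```python
from gradio_client import Client

def get_client(model: str) -> Client:
    # same caching idea as the inline code in the @@ -812 hunk below
    if model not in hf_clients:
        hf_clients[model] = Client(model, hf_token=hf_token)
    return hf_clients[model]
```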
@@ -334,6 +338,7 @@ scheduler = CommitScheduler(
 # Router API
 ####################################
 # router = Client("TTS-AGI/tts-router", hf_token=hf_token)
+router = {}
 ####################################
 # Gradio app
 ####################################
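`router = {}` is a plain dict standing in for the commented-out `TTS-AGI/tts-router` client, presumably kept only so the name stays defined; a dict has no `predict`, so the `router.predict(...)` branch that survives in the `@@ -852` hunk below would raise `AttributeError` if it were ever reached for a model not backed by a public Space.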
@@ -792,15 +797,15 @@ def synthandreturn(text):
         pass
     # Get two random models
     # forced model: your TTS model versus The World!!!
-    mdl1 = 'Pendrokar/xVASynth'
+    # mdl1 = 'Pendrokar/xVASynth'
     vsModels = dict(AVAILABLE_MODELS)
-    del vsModels[mdl1]
+    # del vsModels[mdl1]
     # randomize position of the forced model
     mdl2 = random.sample(list(vsModels.keys()), 1)
     # forced random
-    mdl1, mdl2 = random.sample(list([mdl1, mdl2[0]]), 2)
+    # mdl1, mdl2 = random.sample(list([mdl1, mdl2[0]]), 2)
     # actual random
-
+    mdl1, mdl2 = random.sample(list(AVAILABLE_MODELS.keys()), 2)
     log_text(text)
     print("[debug] Using", mdl1, mdl2)
     def predict_and_update_result(text, model, result_storage):
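The forced xVASynth-versus-the-field matchup is commented out in favor of a uniform draw. `random.sample` selects without replacement, so the two models are guaranteed distinct and a model never battles itself; note the surviving `vsModels`/`mdl2` lines are now dead assignments, since `mdl2` is overwritten two lines later. In miniature:

```python
import random

models = list(AVAILABLE_MODELS.keys())
mdl1, mdl2 = random.sample(models, 2)  # two distinct entries, order randomized
assert mdl1 != mdl2
```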
@@ -812,7 +817,11 @@ def synthandreturn(text):
             if model in AVAILABLE_MODELS:
                 if '/' in model:
                     # Use public HF Space
-
+                    if (model not in hf_clients):
+                        hf_clients[model] = Client(model, hf_token=hf_token)
+                    mdl_space = hf_clients[model]
+
+                    print(f"{model}: Fetching endpoints of HF Space")
                     # assume the index is one of the first 9 return params
                     return_audio_index = int(HF_SPACES[model]['return_audio_index'])
                     endpoints = mdl_space.view_api(all_endpoints=True, print_info=False, return_format='dict')
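`view_api(return_format='dict')` is how the code discovers each endpoint's parameters and example values instead of hard-coding them per Space. A rough sketch of inspecting that structure; the exact keys (`named_endpoints`, `parameters`) are an assumption about gradio_client's dict layout and may vary by version:

```python
client = get_client('mrfakename/MeloTTS')  # hypothetical cached-client helper, see above
api = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
# assumed layout: endpoints keyed by name, each listing its parameters
for name, endpoint in api.get('named_endpoints', {}).items():
    print(name, [p.get('label') for p in endpoint.get('parameters', [])])
```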
@@ -841,9 +850,10 @@ def synthandreturn(text):
 
                     # force text
                     space_inputs[HF_SPACES[model]['text_param_index']] = text
-
+
+                    print(f"{model}: Sending request to HF Space")
                     results = mdl_space.predict(*space_inputs, api_name=api_name, fn_index=fn_index)
-
+
                     # return path to audio
                     result = results[return_audio_index] if (not isinstance(results, str)) else results
                 else:
@@ -852,39 +862,37 @@ def synthandreturn(text):
             else:
                 result = router.predict(text, model.lower(), api_name="/synthesize")
             break
-        except:
+        except Exception:
+            raise Exception
             attempt_count += 1
-
-
-
-
-
-
+            print(f"{model}: Unable to call API (attempt: {attempt_count})")
+            # sleep for one second before trying again
+            time.sleep(1)
+
+        if attempt_count > 2:
+            raise gr.Error(f"{model}: Failed to call model")
+        else:
+            print('Done with', model)
+
         try:
             with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
                 audio = AudioSegment.from_file(result)
                 current_sr = audio.frame_rate
                 if current_sr > 24000:
-                    print(
+                    print(f"{model}: Resampling")
                     audio = audio.set_frame_rate(24000)
                 try:
-                    print(
+                    print(f"{model}: Trying to normalize audio")
                     audio = match_target_amplitude(audio, -20)
                 except:
-                    print(
+                    print(f"{model}: [WARN] Unable to normalize audio")
                 audio.export(f.name, format="wav")
                 os.unlink(result)
                 result = f.name
         except:
             pass
         if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
-        print(model)
-        print(f"Running model {model}")
         result_storage[model] = result
-        # try:
-        #     doloudnorm(result)
-        # except:
-        #     pass
 
     def _get_param_examples(parameters):
         example_inputs = []
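Two changes land here. First, the bare `except:` becomes `except Exception:` with a per-model log line, a one-second pause, and a `gr.Error` after three failed attempts; note that, as written, the new `raise Exception` at the top of the handler re-raises immediately, which would leave the retry bookkeeping below it unreachable. Second, the resample/normalize path gains log lines. `match_target_amplitude` itself is not shown in this diff; the standard pydub recipe it matches, and presumably what app.py defines elsewhere, is:

```python
from pydub import AudioSegment

def match_target_amplitude(sound: AudioSegment, target_dBFS: float) -> AudioSegment:
    # apply the gain needed to bring the clip's average loudness to target_dBFS
    return sound.apply_gain(target_dBFS - sound.dBFS)
```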
@@ -913,7 +921,7 @@ def synthandreturn(text):
         try:
             for key,value in OVERRIDE_INPUTS[modelname].items():
                 inputs[key] = value
-            print(f"Default inputs overridden
+            print(f"{modelname}: Default inputs overridden")
         except:
             pass
 
@@ -1104,3 +1112,4 @@ with gr.Blocks(theme=theme, css="footer {visibility: hidden}textbox{resize:none}
 
 
 demo.queue(api_open=False, default_concurrency_limit=40).launch(show_api=False)
+demo.queue(api_open=False, default_concurrency_limit=40).launch(show_api=False, show_error=True)
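`show_error=True` makes server-side exceptions surface as error toasts in the Gradio UI rather than failing silently. As rendered, the diff adds the new `launch` line after the old one instead of replacing it; if both calls really remain, the first (blocking) `launch()` would keep the `show_error=True` variant from ever running, so the earlier line was likely meant to be removed.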