SohomToom committed on
Commit
ab0bdb4
·
verified ·
1 Parent(s): 22f0c8e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -60
app.py CHANGED
@@ -1,58 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
- import uuid
3
  import time
4
- import torch
5
  import gradio as gr
6
- os.environ["NUMBA_DISABLE_CACHE"] = "1"
7
- import mecab_patch
8
- import english_patch
9
- from melo.api import TTS
10
- from openvoice.api import ToneColorConverter
11
-
12
- # Set temporary cache locations for Hugging Face Spaces
13
- os.environ["TORCH_HOME"] = "/tmp/torch"
14
- os.environ["HF_HOME"] = "/tmp/huggingface"
15
- os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
16
- os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
17
- os.environ["MPLCONFIGDIR"] = "/tmp"
18
- os.environ["XDG_CACHE_HOME"] = "/tmp"
19
- os.environ["XDG_CONFIG_HOME"] = "/tmp"
20
- os.environ["NUMBA_DISABLE_CACHE"] = "1"
21
- os.makedirs("/tmp/torch", exist_ok=True)
22
- os.makedirs("/tmp/huggingface", exist_ok=True)
23
- os.makedirs("/tmp/flagged", exist_ok=True)
24
-
25
- # Output folder
26
- output_dir = "/tmp/outputs"
27
- os.makedirs(output_dir, exist_ok=True)
28
 
29
- # Initialize tone converter
30
- ckpt_converter = "checkpoints/converter/config.json"
31
- tone_color_converter = ToneColorConverter(ckpt_converter)
32
 
33
- # Device setting
34
- device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
 
 
 
 
 
 
35
 
36
  def clone_and_speak(text, speaker_wav):
37
  if not speaker_wav:
38
  return "Please upload a reference .wav file."
39
 
40
- # import melo.text.english as english
41
- # original_g2p = english.g2p
42
-
43
- # def patched_g2p(text):
44
- # phones, tones, word2ph = original_g2p(text)
45
- # # Fix: wrap ints in list to avoid TypeError
46
- # word2ph_fixed = []
47
- # for item in word2ph:
48
- # if isinstance(item, int):
49
- # word2ph_fixed.append([item])
50
- # else:
51
- # word2ph_fixed.append(item)
52
- # return phones, tones, word2ph_fixed
53
-
54
- # english.g2p = patched_g2p
55
-
56
  base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
57
  tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
58
  final_output_path = f"{output_dir}/{base_name}_converted.wav"
@@ -63,33 +129,32 @@ def clone_and_speak(text, speaker_wav):
63
  default_speaker_id = next(iter(speaker_ids.values()))
64
 
65
  # Generate base TTS voice
66
- speed = 1.0
67
- model.tts_to_file(text, default_speaker_id, tmp_melo_path,speed=speed)
68
 
69
- # Use speaker_wav as reference to extract style embedding
70
- from openvoice import se_extractor
71
  ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)
72
 
73
- # Run the tone conversion
74
  tone_color_converter.convert(
75
  audio_src_path=tmp_melo_path,
76
  src_se=ref_se,
77
  tgt_se=ref_se,
78
  output_path=final_output_path,
79
- message="@HuggingFace",
80
  )
81
 
82
  return final_output_path
83
 
84
- # Gradio interface
85
- gr.Interface(
86
  fn=clone_and_speak,
87
  inputs=[
88
- gr.Textbox(label="Enter Text"),
89
- gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)")
90
  ],
91
- outputs=gr.Audio(label="Synthesized Output"),
92
- flagging_dir="/tmp/flagged",
93
- title="Text to Voice using Melo TTS + OpenVoice",
94
- description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
95
- ).launch()
 
 
1
+ # import os
2
+ # import uuid
3
+ # import time
4
+ # import torch
5
+ # import gradio as gr
6
+ # os.environ["NUMBA_DISABLE_CACHE"] = "1"
7
+ # import mecab_patch
8
+ # import english_patch
9
+ # from melo.api import TTS
10
+ # from openvoice.api import ToneColorConverter
11
+
12
+ # # Set temporary cache locations for Hugging Face Spaces
13
+ # os.environ["TORCH_HOME"] = "/tmp/torch"
14
+ # os.environ["HF_HOME"] = "/tmp/huggingface"
15
+ # os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
16
+ # os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
17
+ # os.environ["MPLCONFIGDIR"] = "/tmp"
18
+ # os.environ["XDG_CACHE_HOME"] = "/tmp"
19
+ # os.environ["XDG_CONFIG_HOME"] = "/tmp"
20
+ # os.environ["NUMBA_DISABLE_CACHE"] = "1"
21
+ # os.makedirs("/tmp/torch", exist_ok=True)
22
+ # os.makedirs("/tmp/huggingface", exist_ok=True)
23
+ # os.makedirs("/tmp/flagged", exist_ok=True)
24
+
25
+ # # Output folder
26
+ # output_dir = "/tmp/outputs"
27
+ # os.makedirs(output_dir, exist_ok=True)
28
+
29
+ # # Initialize tone converter
30
+ # ckpt_converter = "checkpoints/converter/config.json"
31
+ # tone_color_converter = ToneColorConverter(ckpt_converter)
32
+
33
+ # # Device setting
34
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
35
+
36
+ # def clone_and_speak(text, speaker_wav):
37
+ # if not speaker_wav:
38
+ # return "Please upload a reference .wav file."
39
+
40
+ # # import melo.text.english as english
41
+ # # original_g2p = english.g2p
42
+
43
+ # # def patched_g2p(text):
44
+ # # phones, tones, word2ph = original_g2p(text)
45
+ # # # Fix: wrap ints in list to avoid TypeError
46
+ # # word2ph_fixed = []
47
+ # # for item in word2ph:
48
+ # # if isinstance(item, int):
49
+ # # word2ph_fixed.append([item])
50
+ # # else:
51
+ # # word2ph_fixed.append(item)
52
+ # # return phones, tones, word2ph_fixed
53
+
54
+ # # english.g2p = patched_g2p
55
+
56
+ # base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
57
+ # tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
58
+ # final_output_path = f"{output_dir}/{base_name}_converted.wav"
59
+
60
+ # # Use English speaker model
61
+ # model = TTS(language="EN", device=device)
62
+ # speaker_ids = model.hps.data.spk2id
63
+ # default_speaker_id = next(iter(speaker_ids.values()))
64
+
65
+ # # Generate base TTS voice
66
+ # speed = 1.0
67
+ # model.tts_to_file(text, default_speaker_id, tmp_melo_path,speed=speed)
68
+
69
+ # # Use speaker_wav as reference to extract style embedding
70
+ # from openvoice import se_extractor
71
+ # ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)
72
+
73
+ # # Run the tone conversion
74
+ # tone_color_converter.convert(
75
+ # audio_src_path=tmp_melo_path,
76
+ # src_se=ref_se,
77
+ # tgt_se=ref_se,
78
+ # output_path=final_output_path,
79
+ # message="@HuggingFace",
80
+ # )
81
+
82
+ # return final_output_path
83
+
84
+ # # Gradio interface
85
+ # gr.Interface(
86
+ # fn=clone_and_speak,
87
+ # inputs=[
88
+ # gr.Textbox(label="Enter Text"),
89
+ # gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)")
90
+ # ],
91
+ # outputs=gr.Audio(label="Synthesized Output"),
92
+ # flagging_dir="/tmp/flagged",
93
+ # title="Text to Voice using Melo TTS + OpenVoice",
94
+ # description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
95
+ # ).launch()
96
+
97
+
98
  import os
 
99
  import time
100
+ import uuid
101
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
+ from TTS.api import TTS
104
+ from openvoice import se_extractor, ToneColorConverter
 
105
 
106
+ # Import your local english.py logic
107
+ from meloTTS import english
108
+
109
+ # Paths
110
+ device = "cuda" if os.system("nvidia-smi") == 0 else "cpu"
111
+ output_dir = "outputs"
112
+ os.makedirs(output_dir, exist_ok=True)
113
+
114
+ # Load OpenVoice tone converter
115
+ tone_color_converter = ToneColorConverter(f"{os.getcwd()}/checkpoints", device=device)
116
+ tone_color_converter.load_model()
117
 
118
  def clone_and_speak(text, speaker_wav):
119
  if not speaker_wav:
120
  return "Please upload a reference .wav file."
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
123
  tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
124
  final_output_path = f"{output_dir}/{base_name}_converted.wav"
 
129
  default_speaker_id = next(iter(speaker_ids.values()))
130
 
131
  # Generate base TTS voice
132
+ model.tts_to_file(text, speaker_id=default_speaker_id, file_path=tmp_melo_path, speed=1.0)
 
133
 
134
+ # Extract style embedding
 
135
  ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)
136
 
137
+ # Convert tone
138
  tone_color_converter.convert(
139
  audio_src_path=tmp_melo_path,
140
  src_se=ref_se,
141
  tgt_se=ref_se,
142
  output_path=final_output_path,
143
+ message="@HuggingFace"
144
  )
145
 
146
  return final_output_path
147
 
148
+ # Gradio Interface
149
+ demo = gr.Interface(
150
  fn=clone_and_speak,
151
  inputs=[
152
+ gr.Textbox(label="Text to Synthesize"),
153
+ gr.Audio(label="Reference Voice (WAV)", type="filepath")
154
  ],
155
+ outputs=gr.Audio(label="Cloned Voice Output"),
156
+ title="Voice Cloner with MeloTTS + OpenVoice"
157
+ )
158
+
159
+ if __name__ == "__main__":
160
+ demo.launch()