SohomToom committed on
Commit
ed1a5ad
·
verified ·
1 Parent(s): 428e60f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -33
app.py CHANGED
@@ -1,16 +1,13 @@
1
  import os
2
- import torch
3
- import time
4
  import uuid
 
 
5
  import gradio as gr
6
- from openvoice import se_extractor
7
  from openvoice.api import ToneColorConverter
8
 
9
- # Set writable cache directory for torch
10
  os.environ["TORCH_HOME"] = "/tmp/torch"
11
- os.makedirs("/tmp/torch", exist_ok=True)
12
-
13
- # Environment fixes for HF Spaces
14
  os.environ["HF_HOME"] = "/tmp/huggingface"
15
  os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
16
  os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
@@ -18,48 +15,53 @@ os.environ["MPLCONFIGDIR"] = "/tmp"
18
  os.environ["XDG_CACHE_HOME"] = "/tmp"
19
  os.environ["XDG_CONFIG_HOME"] = "/tmp"
20
  os.environ["NUMBA_DISABLE_CACHE"] = "1"
 
21
  os.makedirs("/tmp/huggingface", exist_ok=True)
22
  os.makedirs("/tmp/flagged", exist_ok=True)
23
 
24
- # Set model paths
25
- ckpt_converter = "checkpoints/converter/config.json"
26
  output_dir = "/tmp/outputs"
27
  os.makedirs(output_dir, exist_ok=True)
28
 
29
- # Initialize OpenVoice converter
 
30
  tone_color_converter = ToneColorConverter(ckpt_converter)
31
 
32
- # Speaker embedding cache
33
- ref_speaker_embed = None
34
 
35
  def clone_and_speak(text, speaker_wav):
36
  if not speaker_wav:
37
  return "Please upload a reference .wav file."
38
 
39
- # Generate a unique filename
40
- timestamp = str(int(time.time()))
41
- base_name = f"output_{timestamp}_{uuid.uuid4().hex[:6]}"
42
- output_wav = os.path.join(output_dir, f"{base_name}.wav")
 
 
 
 
 
 
 
43
 
44
- # Extract style from uploaded speaker voice
45
- global ref_speaker_embed
46
- ref_speaker_embed = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)
47
 
48
- # Generate speech using base model
49
  tone_color_converter.convert(
50
- text=text,
51
- speaker_id="openvoice",
52
- language="en",
53
- ref_speaker=speaker_wav,
54
- ref_embed=ref_speaker_embed,
55
- output_path=output_wav,
56
- top_k=10,
57
- temperature=0.3
58
  )
59
 
60
- return output_wav
61
 
62
- # Gradio interface (exposed as global `demo` for HF Spaces)
63
  gr.Interface(
64
  fn=clone_and_speak,
65
  inputs=[
@@ -67,7 +69,7 @@ gr.Interface(
67
  gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)")
68
  ],
69
  outputs=gr.Audio(label="Synthesized Output"),
70
- flagging_dir="/tmp/flagged", # safe temporary dir
71
- title="Text to Voice using OpenVoice",
72
- description="Clone any voice (English) and generate speech using OpenVoice on CPU.",
73
  ).launch()
 
1
  import os
 
 
2
  import uuid
3
+ import time
4
+ import torch
5
  import gradio as gr
6
+ from melo.api import TTS
7
  from openvoice.api import ToneColorConverter
8
 
9
+ # Set temporary cache locations for Hugging Face Spaces
10
  os.environ["TORCH_HOME"] = "/tmp/torch"
 
 
 
11
  os.environ["HF_HOME"] = "/tmp/huggingface"
12
  os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
13
  os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
 
15
  os.environ["XDG_CACHE_HOME"] = "/tmp"
16
  os.environ["XDG_CONFIG_HOME"] = "/tmp"
17
  os.environ["NUMBA_DISABLE_CACHE"] = "1"
18
+ os.makedirs("/tmp/torch", exist_ok=True)
19
  os.makedirs("/tmp/huggingface", exist_ok=True)
20
  os.makedirs("/tmp/flagged", exist_ok=True)
21
 
22
+ # Output folder
 
23
  output_dir = "/tmp/outputs"
24
  os.makedirs(output_dir, exist_ok=True)
25
 
26
+ # Initialize tone converter
27
+ ckpt_converter = "checkpoints/converter/config.json"
28
  tone_color_converter = ToneColorConverter(ckpt_converter)
29
 
30
+ # Device setting
31
+ device = "cuda" if torch.cuda.is_available() else "cpu"
32
 
33
  def clone_and_speak(text, speaker_wav):
34
  if not speaker_wav:
35
  return "Please upload a reference .wav file."
36
 
37
+ base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
38
+ tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
39
+ final_output_path = f"{output_dir}/{base_name}_converted.wav"
40
+
41
+ # Use English speaker model
42
+ model = TTS(language="EN", device=device)
43
+ speaker_ids = model.hps.data.spk2id
44
+ default_speaker_id = next(iter(speaker_ids.values()))
45
+
46
+ # Generate base TTS voice
47
+ model.tts_to_file(text, default_speaker_id, tmp_melo_path,language="en")
48
 
49
+ # Use speaker_wav as reference to extract style embedding
50
+ from openvoice import se_extractor
51
+ ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)
52
 
53
+ # Run the tone conversion
54
  tone_color_converter.convert(
55
+ audio_src_path=tmp_melo_path,
56
+ src_se=ref_se,
57
+ tgt_se=ref_se,
58
+ output_path=final_output_path,
59
+ message="@HuggingFace",
 
 
 
60
  )
61
 
62
+ return final_output_path
63
 
64
+ # Gradio interface
65
  gr.Interface(
66
  fn=clone_and_speak,
67
  inputs=[
 
69
  gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)")
70
  ],
71
  outputs=gr.Audio(label="Synthesized Output"),
72
+ flagging_dir="/tmp/flagged",
73
+ title="Text to Voice using Melo TTS + OpenVoice",
74
+ description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
75
  ).launch()