SohomToom committed on
Commit
fbd01f5
·
verified ·
1 Parent(s): 6af1dd5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -35
app.py CHANGED
@@ -1,13 +1,16 @@
1
  import os
2
- import uuid
3
- import time
4
  import torch
 
 
5
  import gradio as gr
6
- from melo.api import TTS
7
  from openvoice.api import ToneColorConverter
8
 
9
- # Set temporary cache locations for Hugging Face Spaces
10
  os.environ["TORCH_HOME"] = "/tmp/torch"
 
 
 
11
  os.environ["HF_HOME"] = "/tmp/huggingface"
12
  os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
13
  os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
@@ -15,53 +18,48 @@ os.environ["MPLCONFIGDIR"] = "/tmp"
15
  os.environ["XDG_CACHE_HOME"] = "/tmp"
16
  os.environ["XDG_CONFIG_HOME"] = "/tmp"
17
  os.environ["NUMBA_DISABLE_CACHE"] = "1"
18
- os.makedirs("/tmp/torch", exist_ok=True)
19
  os.makedirs("/tmp/huggingface", exist_ok=True)
20
  os.makedirs("/tmp/flagged", exist_ok=True)
21
 
22
- # Output folder
 
23
  output_dir = "/tmp/outputs"
24
  os.makedirs(output_dir, exist_ok=True)
25
 
26
- # Initialize tone converter
27
- ckpt_converter = "checkpoints/converter/config.json"
28
  tone_color_converter = ToneColorConverter(ckpt_converter)
29
 
30
- # Device setting
31
- device = "cuda" if torch.cuda.is_available() else "cpu"
32
 
33
  def clone_and_speak(text, speaker_wav):
34
  if not speaker_wav:
35
  return "Please upload a reference .wav file."
36
 
37
- base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
38
- tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
39
- final_output_path = f"{output_dir}/{base_name}_converted.wav"
40
-
41
- # Use English speaker model
42
- model = TTS(language="EN", device=device)
43
- speaker_ids = model.hps.data.spk2id
44
- default_speaker_id = next(iter(speaker_ids.values()))
45
-
46
- # Generate base TTS voice
47
- model.tts_to_file(text, default_speaker_id, tmp_melo_path)
48
 
49
- # Use speaker_wav as reference to extract style embedding
50
- from openvoice import se_extractor
51
- ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)
52
 
53
- # Run the tone conversion
54
  tone_color_converter.convert(
55
- audio_src_path=tmp_melo_path,
56
- src_se=ref_se,
57
- tgt_se=ref_se,
58
- output_path=final_output_path,
59
- message="@HuggingFace",
 
 
 
60
  )
61
 
62
- return final_output_path
63
 
64
- # Gradio interface
65
  gr.Interface(
66
  fn=clone_and_speak,
67
  inputs=[
@@ -69,7 +67,7 @@ gr.Interface(
69
  gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)")
70
  ],
71
  outputs=gr.Audio(label="Synthesized Output"),
72
- flagging_dir="/tmp/flagged",
73
- title="Text to Voice using Melo TTS + OpenVoice",
74
- description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
75
  ).launch()
 
1
  import os
 
 
2
  import torch
3
+ import time
4
+ import uuid
5
  import gradio as gr
6
+ from openvoice import se_extractor
7
  from openvoice.api import ToneColorConverter
8
 
9
+ # Set writable cache directory for torch
10
  os.environ["TORCH_HOME"] = "/tmp/torch"
11
+ os.makedirs("/tmp/torch", exist_ok=True)
12
+
13
+ # Environment fixes for HF Spaces
14
  os.environ["HF_HOME"] = "/tmp/huggingface"
15
  os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
16
  os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
 
18
  os.environ["XDG_CACHE_HOME"] = "/tmp"
19
  os.environ["XDG_CONFIG_HOME"] = "/tmp"
20
  os.environ["NUMBA_DISABLE_CACHE"] = "1"
 
21
  os.makedirs("/tmp/huggingface", exist_ok=True)
22
  os.makedirs("/tmp/flagged", exist_ok=True)
23
 
24
+ # Set model paths
25
+ ckpt_converter = "checkpoints/converter/config.json"
26
  output_dir = "/tmp/outputs"
27
  os.makedirs(output_dir, exist_ok=True)
28
 
29
+ # Initialize OpenVoice converter
 
30
  tone_color_converter = ToneColorConverter(ckpt_converter)
31
 
32
+ # Most recent speaker embedding (reassigned on every clone_and_speak call, not reused)
33
+ ref_speaker_embed = None
34
 
35
  def clone_and_speak(text, speaker_wav):
36
  if not speaker_wav:
37
  return "Please upload a reference .wav file."
38
 
39
+ # Generate a unique filename
40
+ timestamp = str(int(time.time()))
41
+ base_name = f"output_{timestamp}_{uuid.uuid4().hex[:6]}"
42
+ output_wav = os.path.join(output_dir, f"{base_name}.wav")
 
 
 
 
 
 
 
43
 
44
+ # Extract style from uploaded speaker voice
45
+ global ref_speaker_embed
46
+ ref_speaker_embed = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)
47
 
48
+ # Call the tone color converter to write the result to output_wav
49
  tone_color_converter.convert(
50
+ text=text,
51
+ speaker_id="openvoice",
52
+ language="en",
53
+ ref_speaker=speaker_wav,
54
+ ref_embed=ref_speaker_embed,
55
+ output_path=output_wav,
56
+ top_k=10,
57
+ temperature=0.3
58
  )
59
 
60
+ return output_wav
61
 
62
+ # Gradio interface (built and launched inline; not assigned to a global name)
63
  gr.Interface(
64
  fn=clone_and_speak,
65
  inputs=[
 
67
  gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)")
68
  ],
69
  outputs=gr.Audio(label="Synthesized Output"),
70
+ flagging_dir="/tmp/flagged", # safe temporary dir
71
+ title="Text to Voice using OpenVoice",
72
+ description="Clone any voice (English) and generate speech using OpenVoice on CPU.",
73
  ).launch()