积极的屁孩 committed on
Commit
980462e
·
1 Parent(s): 6ac7b96

utilize gpu

Browse files
Files changed (1) hide show
  1. app.py +93 -94
app.py CHANGED
@@ -5,14 +5,12 @@ import site
5
  import json
6
  import torch
7
  import gradio as gr
 
8
  import torchaudio
9
  import numpy as np
10
  from huggingface_hub import snapshot_download, hf_hub_download
11
  import subprocess
12
  import re
13
- # 添加Huggingface spaces导入
14
- from huggingface_hub.spaces import Space
15
- import spaces
16
 
17
  def install_espeak():
18
  """检测并安装espeak-ng依赖"""
@@ -353,7 +351,6 @@ def get_pipeline(pipeline_type):
353
  return inference_pipeline
354
 
355
  # 实现VEVO功能函数
356
- @spaces.GPU
357
  def vevo_style(content_wav, style_wav):
358
  temp_content_path = "wav/temp_content.wav"
359
  temp_style_path = "wav/temp_style.wav"
@@ -436,7 +433,6 @@ def vevo_style(content_wav, style_wav):
436
  traceback.print_exc()
437
  raise e
438
 
439
- @spaces.GPU
440
  def vevo_timbre(content_wav, reference_wav):
441
  temp_content_path = "wav/temp_content.wav"
442
  temp_reference_path = "wav/temp_reference.wav"
@@ -530,7 +526,6 @@ def vevo_timbre(content_wav, reference_wav):
530
  traceback.print_exc()
531
  raise e
532
 
533
- @spaces.GPU
534
  def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
535
  temp_content_path = "wav/temp_content.wav"
536
  temp_style_path = "wav/temp_style.wav"
@@ -652,7 +647,6 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
652
  traceback.print_exc()
653
  raise e
654
 
655
- @spaces.GPU
656
  def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en", style_ref_text_language="en"):
657
  temp_ref_path = "wav/temp_ref.wav"
658
  temp_timbre_path = "wav/temp_timbre.wav"
@@ -756,93 +750,98 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
756
  raise e
757
 
758
  # 创建Gradio界面
759
- with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
760
- gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")
761
- # 添加链接标签行
762
- with gr.Row(elem_id="links_row"):
763
- gr.HTML("""
764
- <div style="display: flex; justify-content: flex-start; gap: 8px; margin: 0 0; padding-left: 0px;">
765
- <a href="https://arxiv.org/abs/2502.07243" target="_blank" style="text-decoration: none;">
766
- <img alt="arXiv Paper" src="https://img.shields.io/badge/arXiv-Paper-red">
767
- </a>
768
- <a href="https://openreview.net/pdf?id=anQDiQZhDP" target="_blank" style="text-decoration: none;">
769
- <img alt="ICLR Paper" src="https://img.shields.io/badge/ICLR-Paper-64b63a">
770
- </a>
771
- <a href="https://huggingface.co/amphion/Vevo" target="_blank" style="text-decoration: none;">
772
- <img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow">
773
- </a>
774
- <a href="https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo" target="_blank" style="text-decoration: none;">
775
- <img alt="GitHub Repo" src="https://img.shields.io/badge/GitHub-Repo-blue">
776
- </a>
777
- </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
778
  """)
779
 
780
- with gr.Tab("Vevo-Timbre"):
781
- gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
782
- with gr.Row():
783
- with gr.Column():
784
- timbre_content = gr.Audio(label="Source Audio", type="numpy")
785
- timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
786
- timbre_button = gr.Button("Generate")
787
- with gr.Column():
788
- timbre_output = gr.Audio(label="Result")
789
- timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
790
-
791
- with gr.Tab("Vevo-Style"):
792
- gr.Markdown("### Vevo-Style: Maintain timbre but transfer style (accent, emotion, etc.)")
793
- with gr.Row():
794
- with gr.Column():
795
- style_content = gr.Audio(label="Source Audio", type="numpy")
796
- style_reference = gr.Audio(label="Style Reference", type="numpy")
797
- style_button = gr.Button("Generate")
798
- with gr.Column():
799
- style_output = gr.Audio(label="Result")
800
- style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
801
-
802
- with gr.Tab("Vevo-Voice"):
803
- gr.Markdown("### Vevo-Voice: Transfers both style and timbre with separate references")
804
- with gr.Row():
805
- with gr.Column():
806
- voice_content = gr.Audio(label="Source Audio", type="numpy")
807
- voice_style_reference = gr.Audio(label="Style Reference", type="numpy")
808
- voice_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
809
- voice_button = gr.Button("Generate")
810
- with gr.Column():
811
- voice_output = gr.Audio(label="Result")
812
- voice_button.click(vevo_voice, inputs=[voice_content, voice_style_reference, voice_timbre_reference], outputs=voice_output)
813
-
814
-
815
-
816
- with gr.Tab("Vevo-TTS"):
817
- gr.Markdown("### Vevo-TTS: Text-to-speech with separate style and timbre references")
818
- with gr.Row():
819
- with gr.Column():
820
- tts_text = gr.Textbox(label="Target Text", placeholder="Enter text to synthesize...", lines=3)
821
- tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Text Language", value="en")
822
- tts_reference = gr.Audio(label="Style Reference", type="numpy")
823
- tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
824
- tts_style_ref_text_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Style Reference Text Language", value="en")
825
- tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
826
- tts_button = gr.Button("Generate")
827
- with gr.Column():
828
- tts_output = gr.Audio(label="Result")
829
-
830
- tts_button.click(
831
- vevo_tts,
832
- inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_style_ref_text_language],
833
- outputs=tts_output
834
- )
835
-
836
- gr.Markdown("""
837
- ## About VEVO
838
- VEVO is a versatile voice synthesis and conversion model that offers four main functionalities:
839
- 1. **Vevo-Style**: Maintains timbre but transfers style (accent, emotion, etc.)
840
- 2. **Vevo-Timbre**: Maintains style but transfers timbre
841
- 3. **Vevo-Voice**: Transfers both style and timbre with separate references
842
- 4. **Vevo-TTS**: Text-to-speech with separate style and timbre references
843
-
844
- For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
845
- """)
846
 
847
- # 启动应用
848
- demo.launch()
 
5
  import json
6
  import torch
7
  import gradio as gr
8
+ import gradio.spaces as spaces
9
  import torchaudio
10
  import numpy as np
11
  from huggingface_hub import snapshot_download, hf_hub_download
12
  import subprocess
13
  import re
 
 
 
14
 
15
  def install_espeak():
16
  """检测并安装espeak-ng依赖"""
 
351
  return inference_pipeline
352
 
353
  # 实现VEVO功能函数
 
354
  def vevo_style(content_wav, style_wav):
355
  temp_content_path = "wav/temp_content.wav"
356
  temp_style_path = "wav/temp_style.wav"
 
433
  traceback.print_exc()
434
  raise e
435
 
 
436
  def vevo_timbre(content_wav, reference_wav):
437
  temp_content_path = "wav/temp_content.wav"
438
  temp_reference_path = "wav/temp_reference.wav"
 
526
  traceback.print_exc()
527
  raise e
528
 
 
529
  def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
530
  temp_content_path = "wav/temp_content.wav"
531
  temp_style_path = "wav/temp_style.wav"
 
647
  traceback.print_exc()
648
  raise e
649
 
 
650
  def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en", style_ref_text_language="en"):
651
  temp_ref_path = "wav/temp_ref.wav"
652
  temp_timbre_path = "wav/temp_timbre.wav"
 
750
  raise e
751
 
752
  # 创建Gradio界面
753
+ @spaces.GPU
754
+ def run_app():
755
+ with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
756
+ gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")
757
+ # 添加链接标签行
758
+ with gr.Row(elem_id="links_row"):
759
+ gr.HTML("""
760
+ <div style="display: flex; justify-content: flex-start; gap: 8px; margin: 0 0; padding-left: 0px;">
761
+ <a href="https://arxiv.org/abs/2502.07243" target="_blank" style="text-decoration: none;">
762
+ <img alt="arXiv Paper" src="https://img.shields.io/badge/arXiv-Paper-red">
763
+ </a>
764
+ <a href="https://openreview.net/pdf?id=anQDiQZhDP" target="_blank" style="text-decoration: none;">
765
+ <img alt="ICLR Paper" src="https://img.shields.io/badge/ICLR-Paper-64b63a">
766
+ </a>
767
+ <a href="https://huggingface.co/amphion/Vevo" target="_blank" style="text-decoration: none;">
768
+ <img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow">
769
+ </a>
770
+ <a href="https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo" target="_blank" style="text-decoration: none;">
771
+ <img alt="GitHub Repo" src="https://img.shields.io/badge/GitHub-Repo-blue">
772
+ </a>
773
+ </div>
774
+ """)
775
+
776
+ with gr.Tab("Vevo-Timbre"):
777
+ gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
778
+ with gr.Row():
779
+ with gr.Column():
780
+ timbre_content = gr.Audio(label="Source Audio", type="numpy")
781
+ timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
782
+ timbre_button = gr.Button("Generate")
783
+ with gr.Column():
784
+ timbre_output = gr.Audio(label="Result")
785
+ timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
786
+
787
+ with gr.Tab("Vevo-Style"):
788
+ gr.Markdown("### Vevo-Style: Maintain timbre but transfer style (accent, emotion, etc.)")
789
+ with gr.Row():
790
+ with gr.Column():
791
+ style_content = gr.Audio(label="Source Audio", type="numpy")
792
+ style_reference = gr.Audio(label="Style Reference", type="numpy")
793
+ style_button = gr.Button("Generate")
794
+ with gr.Column():
795
+ style_output = gr.Audio(label="Result")
796
+ style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
797
+
798
+ with gr.Tab("Vevo-Voice"):
799
+ gr.Markdown("### Vevo-Voice: Transfers both style and timbre with separate references")
800
+ with gr.Row():
801
+ with gr.Column():
802
+ voice_content = gr.Audio(label="Source Audio", type="numpy")
803
+ voice_style_reference = gr.Audio(label="Style Reference", type="numpy")
804
+ voice_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
805
+ voice_button = gr.Button("Generate")
806
+ with gr.Column():
807
+ voice_output = gr.Audio(label="Result")
808
+ voice_button.click(vevo_voice, inputs=[voice_content, voice_style_reference, voice_timbre_reference], outputs=voice_output)
809
+
810
+
811
+
812
+ with gr.Tab("Vevo-TTS"):
813
+ gr.Markdown("### Vevo-TTS: Text-to-speech with separate style and timbre references")
814
+ with gr.Row():
815
+ with gr.Column():
816
+ tts_text = gr.Textbox(label="Target Text", placeholder="Enter text to synthesize...", lines=3)
817
+ tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Text Language", value="en")
818
+ tts_reference = gr.Audio(label="Style Reference", type="numpy")
819
+ tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
820
+ tts_style_ref_text_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Style Reference Text Language", value="en")
821
+ tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
822
+ tts_button = gr.Button("Generate")
823
+ with gr.Column():
824
+ tts_output = gr.Audio(label="Result")
825
+
826
+ tts_button.click(
827
+ vevo_tts,
828
+ inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_style_ref_text_language],
829
+ outputs=tts_output
830
+ )
831
+
832
+ gr.Markdown("""
833
+ ## About VEVO
834
+ VEVO is a versatile voice synthesis and conversion model that offers four main functionalities:
835
+ 1. **Vevo-Style**: Maintains timbre but transfers style (accent, emotion, etc.)
836
+ 2. **Vevo-Timbre**: Maintains style but transfers timbre
837
+ 3. **Vevo-Voice**: Transfers both style and timbre with separate references
838
+ 4. **Vevo-TTS**: Text-to-speech with separate style and timbre references
839
+
840
+ For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
841
  """)
842
 
843
+ # 启动应用
844
+ demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
845
 
846
+ # Run the app
847
+ run_app()