Vevo

Sleeping

App Files Community

积极的屁孩 committed on Apr 19

Commit

980462e

1 Parent(s): 6ac7b96

utilize gpu

Browse files

Files changed (1) hide show

app.py +93 -94

app.py CHANGED Viewed

@@ -5,14 +5,12 @@ import site
 import json
 import torch
 import gradio as gr
 import torchaudio
 import numpy as np
 from huggingface_hub import snapshot_download, hf_hub_download
 import subprocess
 import re
-# 添加Huggingface spaces导入
-from huggingface_hub.spaces import Space
-import spaces
 def install_espeak():
     """检测并安装espeak-ng依赖"""
@@ -353,7 +351,6 @@ def get_pipeline(pipeline_type):
     return inference_pipeline
 # 实现VEVO功能函数
-@spaces.GPU
 def vevo_style(content_wav, style_wav):
     temp_content_path = "wav/temp_content.wav"
     temp_style_path = "wav/temp_style.wav"
@@ -436,7 +433,6 @@ def vevo_style(content_wav, style_wav):
         traceback.print_exc()
         raise e
-@spaces.GPU
 def vevo_timbre(content_wav, reference_wav):
     temp_content_path = "wav/temp_content.wav"
     temp_reference_path = "wav/temp_reference.wav"
@@ -530,7 +526,6 @@ def vevo_timbre(content_wav, reference_wav):
         traceback.print_exc()
         raise e
-@spaces.GPU
 def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
     temp_content_path = "wav/temp_content.wav"
     temp_style_path = "wav/temp_style.wav"
@@ -652,7 +647,6 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
         traceback.print_exc()
         raise e
-@spaces.GPU
 def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en", style_ref_text_language="en"):
     temp_ref_path = "wav/temp_ref.wav"
     temp_timbre_path = "wav/temp_timbre.wav"
@@ -756,93 +750,98 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
         raise e
 # 创建Gradio界面
-with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
-    gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")
-    # 添加链接标签行
-    with gr.Row(elem_id="links_row"):
-        gr.HTML("""
-        <div style="display: flex; justify-content: flex-start; gap: 8px; margin: 0 0; padding-left: 0px;">
-            <a href="https://arxiv.org/abs/2502.07243" target="_blank" style="text-decoration: none;">
-                <img alt="arXiv Paper" src="https://img.shields.io/badge/arXiv-Paper-red">
-            </a>
-            <a href="https://openreview.net/pdf?id=anQDiQZhDP" target="_blank" style="text-decoration: none;">
-                <img alt="ICLR Paper" src="https://img.shields.io/badge/ICLR-Paper-64b63a">
-            </a>
-            <a href="https://huggingface.co/amphion/Vevo" target="_blank" style="text-decoration: none;">
-                <img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow">
-            </a>
-            <a href="https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo" target="_blank" style="text-decoration: none;">
-                <img alt="GitHub Repo" src="https://img.shields.io/badge/GitHub-Repo-blue">
-            </a>
-        </div>
         """)
-    with gr.Tab("Vevo-Timbre"):
-        gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
-        with gr.Row():
-            with gr.Column():
-                timbre_content = gr.Audio(label="Source Audio", type="numpy")
-                timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
-                timbre_button = gr.Button("Generate")
-            with gr.Column():
-                timbre_output = gr.Audio(label="Result")
-        timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
-    with gr.Tab("Vevo-Style"):
-        gr.Markdown("### Vevo-Style: Maintain timbre but transfer style (accent, emotion, etc.)")
-        with gr.Row():
-            with gr.Column():
-                style_content = gr.Audio(label="Source Audio", type="numpy")
-                style_reference = gr.Audio(label="Style Reference", type="numpy")
-                style_button = gr.Button("Generate")
-            with gr.Column():
-                style_output = gr.Audio(label="Result")
-        style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
-    with gr.Tab("Vevo-Voice"):
-        gr.Markdown("### Vevo-Voice: Transfers both style and timbre with separate references")
-        with gr.Row():
-            with gr.Column():
-                voice_content = gr.Audio(label="Source Audio", type="numpy")
-                voice_style_reference = gr.Audio(label="Style Reference", type="numpy")
-                voice_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
-                voice_button = gr.Button("Generate")
-            with gr.Column():
-                voice_output = gr.Audio(label="Result")
-        voice_button.click(vevo_voice, inputs=[voice_content, voice_style_reference, voice_timbre_reference], outputs=voice_output)
-    with gr.Tab("Vevo-TTS"):
-        gr.Markdown("### Vevo-TTS: Text-to-speech with separate style and timbre references")
-        with gr.Row():
-            with gr.Column():
-                tts_text = gr.Textbox(label="Target Text", placeholder="Enter text to synthesize...", lines=3)
-                tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Text Language", value="en")
-                tts_reference = gr.Audio(label="Style Reference", type="numpy")
-                tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
-                tts_style_ref_text_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Style Reference Text Language", value="en")
-                tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
-                tts_button = gr.Button("Generate")
-            with gr.Column():
-                tts_output = gr.Audio(label="Result")
-        tts_button.click(
-            vevo_tts,
-            inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_style_ref_text_language],
-            outputs=tts_output
-        )
-    gr.Markdown("""
-    ## About VEVO
-    VEVO is a versatile voice synthesis and conversion model that offers four main functionalities:
-    1. **Vevo-Style**: Maintains timbre but transfers style (accent, emotion, etc.)
-    2. **Vevo-Timbre**: Maintains style but transfers timbre
-    3. **Vevo-Voice**: Transfers both style and timbre with separate references
-    4. **Vevo-TTS**: Text-to-speech with separate style and timbre references
-    For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
-    """)
-# 启动应用
-demo.launch()

 import json
 import torch
 import gradio as gr
+import gradio.spaces as spaces
 import torchaudio
 import numpy as np
 from huggingface_hub import snapshot_download, hf_hub_download
 import subprocess
 import re
 def install_espeak():
     """检测并安装espeak-ng依赖"""
     return inference_pipeline
 # 实现VEVO功能函数
 def vevo_style(content_wav, style_wav):
     temp_content_path = "wav/temp_content.wav"
     temp_style_path = "wav/temp_style.wav"
         traceback.print_exc()
         raise e
 def vevo_timbre(content_wav, reference_wav):
     temp_content_path = "wav/temp_content.wav"
     temp_reference_path = "wav/temp_reference.wav"
         traceback.print_exc()
         raise e
 def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
     temp_content_path = "wav/temp_content.wav"
     temp_style_path = "wav/temp_style.wav"
         traceback.print_exc()
         raise e
 def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en", style_ref_text_language="en"):
     temp_ref_path = "wav/temp_ref.wav"
     temp_timbre_path = "wav/temp_timbre.wav"
         raise e
 # 创建Gradio界面
+@spaces.GPU
+def run_app():
+    with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
+        gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")
+        # 添加链接标签行
+        with gr.Row(elem_id="links_row"):
+            gr.HTML("""
+            <div style="display: flex; justify-content: flex-start; gap: 8px; margin: 0 0; padding-left: 0px;">
+                <a href="https://arxiv.org/abs/2502.07243" target="_blank" style="text-decoration: none;">
+                    <img alt="arXiv Paper" src="https://img.shields.io/badge/arXiv-Paper-red">
+                </a>
+                <a href="https://openreview.net/pdf?id=anQDiQZhDP" target="_blank" style="text-decoration: none;">
+                    <img alt="ICLR Paper" src="https://img.shields.io/badge/ICLR-Paper-64b63a">
+                </a>
+                <a href="https://huggingface.co/amphion/Vevo" target="_blank" style="text-decoration: none;">
+                    <img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow">
+                </a>
+                <a href="https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo" target="_blank" style="text-decoration: none;">
+                    <img alt="GitHub Repo" src="https://img.shields.io/badge/GitHub-Repo-blue">
+                </a>
+            </div>
+            """)
+        with gr.Tab("Vevo-Timbre"):
+            gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
+            with gr.Row():
+                with gr.Column():
+                    timbre_content = gr.Audio(label="Source Audio", type="numpy")
+                    timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
+                    timbre_button = gr.Button("Generate")
+                with gr.Column():
+                    timbre_output = gr.Audio(label="Result")
+            timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
+        with gr.Tab("Vevo-Style"):
+            gr.Markdown("### Vevo-Style: Maintain timbre but transfer style (accent, emotion, etc.)")
+            with gr.Row():
+                with gr.Column():
+                    style_content = gr.Audio(label="Source Audio", type="numpy")
+                    style_reference = gr.Audio(label="Style Reference", type="numpy")
+                    style_button = gr.Button("Generate")
+                with gr.Column():
+                    style_output = gr.Audio(label="Result")
+            style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
+        with gr.Tab("Vevo-Voice"):
+            gr.Markdown("### Vevo-Voice: Transfers both style and timbre with separate references")
+            with gr.Row():
+                with gr.Column():
+                    voice_content = gr.Audio(label="Source Audio", type="numpy")
+                    voice_style_reference = gr.Audio(label="Style Reference", type="numpy")
+                    voice_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
+                    voice_button = gr.Button("Generate")
+                with gr.Column():
+                    voice_output = gr.Audio(label="Result")
+            voice_button.click(vevo_voice, inputs=[voice_content, voice_style_reference, voice_timbre_reference], outputs=voice_output)
+        with gr.Tab("Vevo-TTS"):
+            gr.Markdown("### Vevo-TTS: Text-to-speech with separate style and timbre references")
+            with gr.Row():
+                with gr.Column():
+                    tts_text = gr.Textbox(label="Target Text", placeholder="Enter text to synthesize...", lines=3)
+                    tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Text Language", value="en")
+                    tts_reference = gr.Audio(label="Style Reference", type="numpy")
+                    tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
+                    tts_style_ref_text_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Style Reference Text Language", value="en")
+                    tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
+                    tts_button = gr.Button("Generate")
+                with gr.Column():
+                    tts_output = gr.Audio(label="Result")
+            tts_button.click(
+                vevo_tts,
+                inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_style_ref_text_language],
+                outputs=tts_output
+            )
+        gr.Markdown("""
+        ## About VEVO
+        VEVO is a versatile voice synthesis and conversion model that offers four main functionalities:
+        1. **Vevo-Style**: Maintains timbre but transfers style (accent, emotion, etc.)
+        2. **Vevo-Timbre**: Maintains style but transfers timbre
+        3. **Vevo-Voice**: Transfers both style and timbre with separate references
+        4. **Vevo-TTS**: Text-to-speech with separate style and timbre references
+        For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
         """)
+    # 启动应用
+    demo.launch()
+# Run the app
+run_app()