import os
import tempfile
import gradio as gr
import torch
import torchaudio
from loguru import logger
from typing import Optional, Tuple
import random
import numpy as np
from huggingface_hub import snapshot_download
import shutil

from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process
from hunyuanvideo_foley.utils.feature_utils import feature_process
from hunyuanvideo_foley.utils.media_utils import merge_audio_video

# Global variables for model storage
model_dict = None
cfg = None
device = None

# Model path; override via the HIFI_FOLEY_MODEL_PATH environment variable if needed
MODEL_PATH = os.environ.get("HIFI_FOLEY_MODEL_PATH", "./pretrained_models/")
CONFIG_PATH = "configs/hunyuanvideo-foley-xxl.yaml"


def download_model_from_hf(repo_id: str = "tencent/HunyuanVideo-Foley",
                           local_dir: str = "./pretrained_models") -> str:
    """Automatically download the model from HuggingFace into a local directory."""
    try:
        logger.info(f"Starting model download from HuggingFace: {repo_id}")
        logger.info(f"Download target directory: {local_dir}")

        # Make sure the local directory exists
        os.makedirs(local_dir, exist_ok=True)

        # Download the entire repository
        snapshot_download(
            repo_id=repo_id,
            local_dir=local_dir,
            resume_download=True,    # Resume interrupted downloads
            local_files_only=False,  # Allow fetching from the network
        )

        logger.info(f"✅ Model downloaded successfully! Saved in: {local_dir}")
        return f"✅ Model downloaded successfully from {repo_id}!"

    except Exception as e:
        error_msg = f"❌ Model download failed: {str(e)}"
        logger.error(error_msg)
        return error_msg


def setup_device(device_str: str = "auto", gpu_id: int = 0) -> torch.device:
    """Set up the computing device."""
    if device_str == "auto":
        if torch.cuda.is_available():
            device = torch.device(f"cuda:{gpu_id}")
            logger.info(f"Using CUDA device: {device}")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
            logger.info("Using MPS device")
        else:
            device = torch.device("cpu")
            logger.info("Using CPU device")
    else:
        if device_str == "cuda":
            device = torch.device(f"cuda:{gpu_id}")
        else:
            device = torch.device(device_str)
        logger.info(f"Using specified device: {device}")

    return device


def auto_load_models() -> str:
    """Automatically load the preset models."""
    global model_dict, cfg, device

    try:
        # If the model path does not exist, try downloading from HuggingFace
        if not os.path.exists(MODEL_PATH):
            logger.info(f"Model path {MODEL_PATH} does not exist, downloading from HuggingFace...")
            download_result = download_model_from_hf(local_dir=MODEL_PATH.rstrip('/'))
            if "failed" in download_result:
                return download_result

        # If the config file does not exist, try downloading it from HuggingFace as well
        if not os.path.exists(CONFIG_PATH):
            logger.info(f"Config file {CONFIG_PATH} does not exist, trying to download from HuggingFace...")
            # If the config lives under configs/, try downloading that too
            if CONFIG_PATH.startswith("configs/"):
                config_dir = os.path.dirname(CONFIG_PATH)
                if not os.path.exists(config_dir):
                    download_result = download_model_from_hf(local_dir="./")
                    if "failed" in download_result:
                        return download_result

        # Finally, check that the config file exists
        if not os.path.exists(CONFIG_PATH):
            return f"❌ Config file not found: {CONFIG_PATH}"

        # Use GPU by default
        device = setup_device("auto", 0)

        # Load the model
        logger.info("Loading model...")
        logger.info(f"Model path: {MODEL_PATH}")
        logger.info(f"Config path: {CONFIG_PATH}")
        model_dict, cfg = load_model(MODEL_PATH, CONFIG_PATH, device)

        logger.info("✅ Model loaded successfully!")
        return "✅ Model loaded successfully!"

    except Exception as e:
        logger.error(f"Model loading failed: {str(e)}")
        return f"❌ Model loading failed: {str(e)}"
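
# Usage note (hedged; the checkpoint path and script name below are illustrative
# assumptions): pointing HIFI_FOLEY_MODEL_PATH at an existing download lets
# auto_load_models() skip the HuggingFace fetch entirely, e.g.
#
#     HIFI_FOLEY_MODEL_PATH=/data/checkpoints/HunyuanVideo-Foley python app.py
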

def infer_single_video(
    video_file,
    text_prompt: str,
    guidance_scale: float = 4.5,
    num_inference_steps: int = 50,
    sample_nums: int = 1
) -> Tuple[list, str]:
    """Single video inference"""
    global model_dict, cfg, device

    if model_dict is None or cfg is None:
        return [], "❌ Please load the model first!"

    if video_file is None:
        return [], "❌ Please upload a video file!"

    # Allow empty text prompt, use empty string if no prompt provided
    if text_prompt is None:
        text_prompt = ""
    text_prompt = text_prompt.strip()

    try:
        logger.info(f"Processing video: {video_file}")
        logger.info(f"Text prompt: {text_prompt}")

        # Feature processing
        visual_feats, text_feats, audio_len_in_s = feature_process(
            video_file,
            text_prompt,
            model_dict,
            cfg
        )

        # Denoising process to generate multiple audio samples
        # Note: The model now generates sample_nums audio samples per inference
        # The denoise_process function returns audio with shape [batch_size, channels, samples]
        logger.info(f"Generating {sample_nums} audio samples...")
        audio, sample_rate = denoise_process(
            visual_feats,
            text_feats,
            audio_len_in_s,
            model_dict,
            cfg,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps,
            batch_size=sample_nums
        )

        # Create temporary files to save results
        temp_dir = tempfile.mkdtemp()
        video_outputs = []

        # Process each generated audio sample
        for i in range(sample_nums):
            # Save audio file
            audio_output = os.path.join(temp_dir, f"generated_audio_{i+1}.wav")
            torchaudio.save(audio_output, audio[i], sample_rate)

            # Merge video and audio
            video_output = os.path.join(temp_dir, f"video_with_audio_{i+1}.mp4")
            merge_audio_video(audio_output, video_file, video_output)
            video_outputs.append(video_output)

        logger.info(f"Inference completed! Generated {sample_nums} samples.")
        return video_outputs, f"✅ Generated {sample_nums} audio sample(s) successfully!"

    except Exception as e:
        logger.error(f"Inference failed: {str(e)}")
        return [], f"❌ Inference failed: {str(e)}"
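
# A minimal headless-usage sketch (hedged: this helper is not part of the original
# app, and the example path/prompt are assumptions; it simply chains the functions
# defined above without going through the Gradio UI).
def _headless_demo(video_path: str = "examples/1_video.mp4",
                   prompt: str = "A person walks on frozen ice") -> None:
    """Load the models once, run a single inference, and print the output paths."""
    print(auto_load_models())
    videos, status = infer_single_video(video_path, prompt, sample_nums=2)
    print(status)
    for v in videos:
        print("generated:", v)
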

def update_video_outputs(video_list, status_msg):
    """Update video outputs based on the number of generated samples"""
    # Initialize all outputs as None
    outputs = [None] * 6

    # Set values based on generated videos
    for i, video_path in enumerate(video_list[:6]):  # Max 6 samples
        outputs[i] = video_path

    # Return all outputs plus status message
    return tuple(outputs + [status_msg])
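
# Hedged sketch (not in the original app): update_video_outputs() returns raw
# paths, so output components created with visible=False (Samples 2-6 below)
# stay hidden even when populated. A variant that also toggles visibility via
# gr.update could look like this; whether the app should reveal extra samples
# this way is an assumption.
def update_video_outputs_visible(video_list, status_msg):
    """Like update_video_outputs, but un-hides the slots that received a video."""
    outputs = []
    for i in range(6):
        if i < len(video_list):
            outputs.append(gr.update(value=video_list[i], visible=True))
        else:
            outputs.append(gr.update(value=None, visible=False))
    return tuple(outputs + [status_msg])
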

def create_gradio_interface():
    """Create Gradio interface"""
    # Custom CSS for beautiful interface with better contrast
    css = """
    .gradio-container { font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%); min-height: 100vh; }
    .main-header { text-align: center; padding: 2rem 0; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 20px; margin-bottom: 2rem; box-shadow: 0 8px 32px rgba(0,0,0,0.15); }
    .main-header h1 { color: white; font-size: 3rem; font-weight: 700; margin-bottom: 0.5rem; text-shadow: 0 2px 10px rgba(0,0,0,0.3); }
    .main-header p { color: rgba(255, 255, 255, 0.95); font-size: 1.2rem; font-weight: 300; }
    .status-card { background: white; border-radius: 15px; padding: 1rem; margin-bottom: 1.5rem; border: 1px solid #e1e5e9; box-shadow: 0 4px 20px rgba(0,0,0,0.08); }
    .status-card label { color: #2d3748 !important; font-weight: 600 !important; }
    .usage-guide h3 { color: #2d3748 !important; font-weight: 600 !important; margin-bottom: 0.5rem !important; }
    .usage-guide p { color: #4a5568 !important; font-size: 1rem !important; line-height: 1.6 !important; margin: 0.5rem 0 !important; }
    .usage-guide strong { color: #1a202c !important; font-weight: 700 !important; }
    .usage-guide em { color: #1a202c !important; font-weight: 700 !important; font-style: normal !important; }
    .main-interface { margin-bottom: 2rem; }
    .input-section { background: white; border-radius: 20px; padding: 2rem; margin-right: 1rem; box-shadow: 0 8px 32px rgba(0,0,0,0.1); border: 1px solid #e1e5e9; }
    .input-section h3 { color: #2d3748 !important; font-weight: 600 !important; margin-bottom: 1rem !important; }
    .input-section label { color: #4a5568 !important; font-weight: 500 !important; }
    .output-section { background: white; border-radius: 20px; padding: 2rem; margin-left: 1rem; box-shadow: 0 8px 32px rgba(0,0,0,0.1); border: 1px solid #e1e5e9; }
    .output-section h3 { color: #2d3748 !important; font-weight: 600 !important; margin-bottom: 1rem !important; }
    .output-section label { color: #4a5568 !important; font-weight: 500 !important; }
    .examples-section h3 { color: #2d3748 !important; font-weight: 600 !important; margin-bottom: 1.5rem !important; }
    .generate-btn { background: linear-gradient(45deg, #667eea, #764ba2) !important; border: none !important; color: white !important; font-weight: 600 !important; font-size: 1.1rem !important; padding: 12px 30px !important; border-radius: 25px !important; box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4) !important; transition: all 0.3s ease !important; }
    .generate-btn:hover { transform: translateY(-2px) !important; box-shadow: 0 8px 25px rgba(102, 126, 234, 0.6) !important; }
    .examples-section { background: white; border-radius: 20px; padding: 2rem; margin-top: 2rem; box-shadow: 0 8px 32px rgba(0,0,0,0.1); border: 1px solid #e1e5e9; }
    .examples-section p { color: #4a5568 !important; margin-bottom: 1rem !important; }
    .example-row { background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 15px; padding: 1.5rem; margin: 1rem 0; transition: all 0.3s ease; align-items: center; }
    .example-row:hover { border-color: #667eea; transform: translateY(-2px); box-shadow: 0 4px 20px rgba(102, 126, 234, 0.15); }
    .example-row .markdown { color: #2d3748 !important; }
    .example-row .markdown p { color: #2d3748 !important; margin: 0.5rem 0 !important; line-height: 1.5 !important; }
    .example-row .markdown strong { color: #1a202c !important; font-weight: 600 !important; }
    /* Example grid layout styles */
    .example-grid-row { margin: 1rem 0; gap: 1rem; }
    .example-item { background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 15px; padding: 1rem; transition: all 0.3s ease; margin: 0.25rem; max-width: 250px; margin-left: auto; margin-right: auto; }
    .example-item:hover { border-color: #667eea; transform: translateY(-2px); box-shadow: 0 4px 20px rgba(102, 126, 234, 0.15); }
    .example-caption { margin: 0.5rem 0 !important; min-height: 2.8rem !important; display: flex !important; align-items: flex-start !important; }
    .example-caption p { color: #2d3748 !important; font-size: 0.9rem !important; line-height: 1.4 !important; margin: 0.5rem 0 !important; }
    /* Multi-video gallery styles */
    .additional-samples { margin-top: 1rem; gap: 0.5rem; }
    .additional-samples .gradio-video { border-radius: 10px; overflow: hidden; }
    /* Video gallery responsive layout */
    .video-gallery { display: grid; gap: 1rem; margin-top: 1rem; }
    .video-gallery.single { grid-template-columns: 1fr; }
    .video-gallery.dual { grid-template-columns: 1fr 1fr; }
    .video-gallery.multi { grid-template-columns: repeat(2, 1fr); grid-template-rows: auto auto auto; }
    .footer-text { color: #718096 !important; text-align: center; padding: 2rem; font-size: 0.9rem; }
    /* Video component styling for consistent size */
    .input-section video, .output-section video, .example-row video { width: 100% !important; height: 300px !important; object-fit: contain !important; border-radius: 10px !important; background-color: #000 !important; }
    .example-row video { height: 150px !important; }
    /* Fix for additional samples video display */
    .additional-samples video { height: 150px !important; object-fit: contain !important; border-radius: 10px !important; background-color: #000 !important; }
    .additional-samples .gradio-video { border-radius: 10px !important; overflow: hidden !important; background-color: #000 !important; }
    .additional-samples .gradio-video > div { background-color: #000 !important; border-radius: 10px !important; }
    /* Video container styling */
    .input-section .video-container, .output-section .video-container, .example-row .video-container { background-color: #000 !important; border-radius: 10px !important; display: flex !important; align-items: center !important; justify-content: center !important; overflow: hidden !important; }
    /* Ensure proper alignment */
    .example-row { display: flex !important; align-items: stretch !important; }
    .example-row > div { display: flex !important; flex-direction: column !important; justify-content: center !important; }
    /* Video wrapper for better control */
    .video-wrapper { position: relative !important; width: 100% !important; background: #000 !important; border-radius: 10px !important; overflow: hidden !important; display: flex !important; align-items: center !important; justify-content: center !important; }
    """

    with gr.Blocks(css=css, title="HunyuanVideo-Foley") as app:
        # Main header
        with gr.Column(elem_classes=["main-header"]):
            gr.HTML("""
                <h1>HunyuanVideo-Foley</h1>
                <p>Text-Video-to-Audio Synthesis: Generate realistic audio from video and text descriptions</p>
""") # Usage Guide with gr.Column(elem_classes=["status-card"]): gr.Markdown(""" ### 📋 Quick Start Guide **1.** Upload your video file\t**2.** Add optional text description\t**3.** Adjust sample numbers (1-6)\t**4.** Click Generate Audio 💡 For quick start, you can load the prepared examples by clicking the button. """, elem_classes=["usage-guide"]) # Main inference interface - Input and Results side by side with gr.Row(elem_classes=["main-interface"]): # Input section with gr.Column(scale=1, elem_classes=["input-section"]): gr.Markdown("### 📹 Video Input") video_input = gr.Video( label="Upload Video", info="Supported formats: MP4, AVI, MOV, etc.", height=300 ) text_input = gr.Textbox( label="🎯 Audio Description (English)", placeholder="A person walks on frozen ice", lines=3, info="Describe the audio you want to generate (optional)" ) with gr.Row(): guidance_scale = gr.Slider( minimum=1.0, maximum=10.0, value=4.5, step=0.1, label="🎚️ CFG Scale", ) inference_steps = gr.Slider( minimum=10, maximum=100, value=50, step=5, label="⚡ Steps", ) sample_nums = gr.Slider( minimum=1, maximum=6, value=1, step=1, label="🎲 Sample Nums", ) generate_btn = gr.Button( "🎵 Generate Audio", variant="primary", elem_classes=["generate-btn"] ) # Results section with gr.Column(scale=1, elem_classes=["output-section"]): gr.Markdown("### 🎥 Generated Results") # Multi-video gallery for displaying multiple generated samples with gr.Column(): # Primary video (Sample 1) video_output_1 = gr.Video( label="Sample 1", height=250, visible=True ) # Additional videos (Samples 2-6) - initially hidden with gr.Row(elem_classes=["additional-samples"]): with gr.Column(scale=1): video_output_2 = gr.Video( label="Sample 2", height=150, visible=False ) video_output_3 = gr.Video( label="Sample 3", height=150, visible=False ) with gr.Column(scale=1): video_output_4 = gr.Video( label="Sample 4", height=150, visible=False ) video_output_5 = gr.Video( label="Sample 5", height=150, visible=False ) # Sample 6 - full width video_output_6 = gr.Video( label="Sample 6", height=150, visible=False ) result_text = gr.Textbox( label="Status", interactive=False, lines=2 ) # Examples section at the bottom with gr.Column(elem_classes=["examples-section"]): gr.Markdown("### 🌟 Examples") gr.Markdown("Click on any example to load it into the interface above") # Define your custom examples here - 8 examples total examples_data = [ # Example 1 { "caption": "A person walks on frozen ice", "video_path": "examples/1_video.mp4", "result_path": "examples/1_result.mp4" }, # Example 2 { "caption": "With a faint sound as their hands parted, the two embraced, a soft 'mm' escaping between them.", "video_path": "examples/2_video.mp4", "result_path": "examples/2_result.mp4" }, # Example 3 { "caption": "The sound of the number 3's bouncing footsteps is as light and clear as glass marbles hitting the ground. 
Each step carries a magical sound.", "video_path": "examples/3_video.mp4", "result_path": "examples/3_result.mp4" }, # Example 4 { "caption": "gentle gurgling of the stream's current, and music plays in the background which is a beautiful and serene piano solo with a hint of classical charm, evoking a sense of peace and serenity in people's hearts.", "video_path": "examples/4_video.mp4", "result_path": "examples/4_result.mp4" }, # Example 5 - Add your new examples here { "caption": "snow crunching under the snowboard's edge.", "video_path": "examples/5_video.mp4", "result_path": "examples/5_result.mp4" }, # Example 6 { "caption": "The crackling of the fire, the whooshing of the flames, and the occasional crisp popping of charred leaves filled the forest.", "video_path": "examples/6_video.mp4", "result_path": "examples/6_result.mp4" }, # Example 7 { "caption": "humming of the scooter engine accelerates slowly.", "video_path": "examples/7_video.mp4", "result_path": "examples/7_result.mp4" }, # Example 8 { "caption": "splash of water and loud thud as person hits the surface.", "video_path": "examples/8_video.mp4", "result_path": "examples/8_result.mp4" } ] # Create example grid - 4 examples per row, 2 rows total example_buttons = [] for row in range(2): # 2 rows with gr.Row(elem_classes=["example-grid-row"]): for col in range(4): # 4 columns idx = row * 4 + col if idx < len(examples_data): example = examples_data[idx] with gr.Column(scale=1, elem_classes=["example-item"]): # Video thumbnail if os.path.exists(example['video_path']): example_video = gr.Video( value=example['video_path'], label=f"Example {idx+1}", interactive=False, show_label=True, height=180 ) else: example_video = gr.HTML(f"""📹 Video not found
{example['video_path']}