STUDIO

Sleeping

App Files Files Community

openfree committed on May 28

Commit

5d025b7

verified ·

1 Parent(s): 63f5669

Update app.py

Browse files

Files changed (1) hide show

app.py +192 -186

app.py CHANGED Viewed

@@ -1,10 +1,25 @@
 import gradio as gr
 import numpy as np
 from PIL import Image, ImageDraw
 from gradio_client import Client, handle_file
 import random
 import tempfile
-import os
 import logging
 import torch
 from diffusers import AutoencoderKL, TCDScheduler
@@ -26,21 +41,6 @@ from concurrent.futures import ThreadPoolExecutor
 # 환경 변수 설정으로 torch.load 체크 우회 (임시 해결책)
 os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"
-# Spaces GPU
-import os
-IS_SPACES = os.environ.get("SPACE_ID") is not None
-if IS_SPACES:
-    import spaces
-else:
-    # GPU 데코레이터가 없을 때를 위한 더미 데코레이터
-    class spaces:
-        @staticmethod
-        def GPU(duration=None):
-            def decorator(func):
-                return func
-            return decorator
 # GPU 초기화를 위한 간단한 함수 (Spaces 환경에서 필수)
 @spaces.GPU(duration=1)
 def gpu_warmup():
@@ -50,7 +50,7 @@ def gpu_warmup():
         del dummy
     return "GPU ready"
-# MMAudio imports
 try:
     import mmaudio
 except ImportError:
@@ -64,6 +64,9 @@ from mmaudio.model.networks import MMAudio, get_my_mmaudio
 from mmaudio.model.sequence_config import SequenceConfig
 from mmaudio.model.utils.features_utils import FeaturesUtils
 # 기존 코드의 모든 설정과 초기화 부분 유지
 torch.set_float32_matmul_precision("medium")
@@ -77,130 +80,21 @@ else:
 logging.info(f"Using device: {device}")
-# BiRefNet 모델 로드
-try:
-    birefnet = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet", trust_remote_code=True)
-    birefnet.to(device)
-    birefnet_lite = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet_lite", trust_remote_code=True)
-    birefnet_lite.to(device)
-    transform_image = transforms.Compose([
-        transforms.Resize((768, 768)),
-        transforms.ToTensor(),
-        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
-    ])
-    BIREFNET_MODEL_LOADED = True
-except Exception as e:
-    logging.error(f"Failed to load BiRefNet models: {str(e)}")
-    BIREFNET_MODEL_LOADED = False
-# ControlNet 모델 로드 (기존 코드)
-try:
-    from controlnet_union import ControlNetModel_Union
-    from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
-    # ControlNet 설정 및 로드
-    config_file = hf_hub_download(
-        "xinsir/controlnet-union-sdxl-1.0",
-        filename="config_promax.json",
-    )
-    config = ControlNetModel_Union.load_config(config_file)
-    controlnet_model = ControlNetModel_Union.from_config(config)
-    model_file = hf_hub_download(
-        "xinsir/controlnet-union-sdxl-1.0",
-        filename="diffusion_pytorch_model_promax.safetensors",
-    )
-    state_dict = load_state_dict(model_file)
-    loaded_keys = list(state_dict.keys())
-    result = ControlNetModel_Union._load_pretrained_model(
-        controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
-    )
-    model = result[0]
-    model = model.to(device=device, dtype=torch.float16 if device.type == "cuda" else torch.float32)
-    # VAE 로드
-    vae = AutoencoderKL.from_pretrained(
-        "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
-    ).to(device)
-    # 파이프라인 로드
-    pipe = StableDiffusionXLFillPipeline.from_pretrained(
-        "SG161222/RealVisXL_V5.0_Lightning",
-        torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
-        vae=vae,
-        controlnet=model,
-        variant="fp16" if device.type == "cuda" else None,
-    ).to(device)
-    pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
-    OUTPAINT_MODEL_LOADED = True
-except Exception as e:
-    logging.error(f"Failed to load outpainting models: {str(e)}")
-    OUTPAINT_MODEL_LOADED = False
-# MMAudio 모델 설정 (기존 코드)
-if torch.cuda.is_available():
-    mmaudio_dtype = torch.bfloat16
-else:
-    mmaudio_dtype = torch.float32
-# MMAudio 모델 초기화 (기존 코드)
-try:
-    model_mmaudio: ModelConfig = all_model_cfg['large_44k_v2']
-    model_mmaudio.download_if_needed()
-    output_dir = Path('./output/gradio')
-    setup_eval_logging()
-    # 번역기 설정
-    try:
-        translator = pipeline("translation",
-                             model="Helsinki-NLP/opus-mt-ko-en",
-                             device="cpu",
-                             use_fast=True,
-                             trust_remote_code=False)
-    except Exception as e:
-        logging.warning(f"Failed to load translation model: {e}")
-        translator = None
-    def get_mmaudio_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
-        with torch.cuda.device(device):
-            seq_cfg = model_mmaudio.seq_cfg
-            net: MMAudio = get_my_mmaudio(model_mmaudio.model_name).to(device, mmaudio_dtype).eval()
-            net.load_weights(torch.load(model_mmaudio.model_path, map_location=device, weights_only=True))
-            logging.info(f'Loaded weights from {model_mmaudio.model_path}')
-            feature_utils = FeaturesUtils(
-                tod_vae_ckpt=model_mmaudio.vae_path,
-                synchformer_ckpt=model_mmaudio.synchformer_ckpt,
-                enable_conditions=True,
-                mode=model_mmaudio.mode,
-                bigvgan_vocoder_ckpt=model_mmaudio.bigvgan_16k_path,
-                need_vae_encoder=False
-            ).to(device, mmaudio_dtype).eval()
-            return net, feature_utils, seq_cfg
-    net_mmaudio, feature_utils, seq_cfg = get_mmaudio_model()
-    MMAUDIO_MODEL_LOADED = True
-except Exception as e:
-    logging.error(f"Failed to load MMAudio models: {str(e)}")
-    MMAUDIO_MODEL_LOADED = False
-    translator = None
 # API URLs
 TEXT2IMG_API_URL = "http://211.233.58.201:7896"
 VIDEO_API_URL = "http://211.233.58.201:7875"
-# 로깅 설정
-logging.basicConfig(level=logging.INFO)
-# Image size presets (기존 코드)
 IMAGE_PRESETS = {
     "커스텀": {"width": 1024, "height": 1024},
     "1:1 정사각형": {"width": 1024, "height": 1024},
@@ -217,6 +111,119 @@ IMAGE_PRESETS = {
     "LinkedIn 배너": {"width": 1584, "height": 396},
 }
 # 기존 함수들 모두 유지
 def update_dimensions(preset):
     if preset in IMAGE_PRESETS:
@@ -332,9 +339,7 @@ def prepare_image_and_mask(image, width, height, overlap_percentage, alignment):
     mask = Image.new('L', target_size, 255)
     mask_draw = ImageDraw.Draw(mask)
-    # 마스크 영역 그리기 (영어 정렬과 매칭)
-    white_gaps_patch = 2
     left_overlap = margin_x + overlap_x if alignment != "왼쪽" else margin_x
     right_overlap = margin_x + new_width - overlap_x if alignment != "오른쪽" else margin_x + new_width
     top_overlap = margin_y + overlap_y if alignment != "위" else margin_y
@@ -374,7 +379,11 @@ def outpaint_image(image, prompt, width, height, overlap_percentage, alignment,
     if image is None:
         return None
-    if not OUTPAINT_MODEL_LOADED:
         return Image.new('RGB', (width, height), (200, 200, 200))
     try:
@@ -391,16 +400,16 @@ def outpaint_image(image, prompt, width, height, overlap_percentage, alignment,
         final_prompt = f"{prompt}, high quality, 4k" if prompt else "high quality, 4k"
         # GPU에서 실행
-        with torch.autocast(device_type=device.type, dtype=torch.float16 if device.type == "cuda" else torch.float32):
             (
                 prompt_embeds,
                 negative_prompt_embeds,
                 pooled_prompt_embeds,
                 negative_pooled_prompt_embeds,
-            ) = pipe.encode_prompt(final_prompt, str(device), True)
             # 생성 프로세스
-            for generated_image in pipe(
                 prompt_embeds=prompt_embeds,
                 negative_prompt_embeds=negative_prompt_embeds,
                 pooled_prompt_embeds=pooled_prompt_embeds,
@@ -427,12 +436,12 @@ def outpaint_image(image, prompt, width, height, overlap_percentage, alignment,
 # MMAudio 관련 함수들
 def translate_prompt(text):
     try:
-        if translator is None:
             return text
         if text and any(ord(char) >= 0x3131 and ord(char) <= 0xD7A3 for char in text):
             with torch.no_grad():
-                translation = translator(text)[0]['translation_text']
             return translation
         return text
     except Exception as e:
@@ -443,7 +452,11 @@ def translate_prompt(text):
 @torch.inference_mode()
 def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                    cfg_strength: float, duration: float):
-    if not MMAUDIO_MODEL_LOADED:
         return None
     prompt = translate_prompt(prompt)
@@ -456,14 +469,14 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
     clip_frames, sync_frames, duration = load_video(video, duration)
     clip_frames = clip_frames.unsqueeze(0)
     sync_frames = sync_frames.unsqueeze(0)
-    seq_cfg.duration = duration
-    net_mmaudio.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
     audios = generate(clip_frames,
                       sync_frames, [prompt],
                       negative_text=[negative_prompt],
-                      feature_utils=feature_utils,
-                      net=net_mmaudio,
                       fm=fm,
                       rng=rng,
                       cfg_strength=cfg_strength)
@@ -473,19 +486,19 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
     make_video(video,
                video_save_path,
                audio,
-               sampling_rate=seq_cfg.sampling_rate,
-               duration_sec=seq_cfg.duration)
     return video_save_path
 # 비디오 배경제거 관련 함수들
 def process_bg_image(image, bg, fast_mode=False):
     """단일 이미지 배경 처리"""
-    if not BIREFNET_MODEL_LOADED:
         return image
     image_size = image.size
     input_images = transform_image(image).unsqueeze(0).to(device)
-    model = birefnet_lite if fast_mode else birefnet
     with torch.no_grad():
         preds = model(input_images)[-1].sigmoid().cpu()
@@ -528,7 +541,11 @@ def process_video_frame(frame, bg_type, bg, fast_mode, bg_frame_index, backgroun
 def process_video_bg(vid, bg_type="색상", bg_image=None, bg_video=None, color="#00FF00",
                      fps=0, video_handling="slow_down", fast_mode=True, max_workers=10):
     """비디오 배경 처리 메인 함수"""
-    if not BIREFNET_MODEL_LOADED:
         yield gr.update(visible=False), gr.update(visible=True), "BiRefNet 모델을 로드하지 못했습니다."
         yield None, None, "BiRefNet 모델을 로드하지 못했습니다."
         return
@@ -697,18 +714,6 @@ def merge_videos_with_audio(video_files, audio_file, audio_volume, output_fps):
         logging.error(f"Video merge error: {str(e)}")
         return None, f"❌ 오류 발생: {str(e)}"
-# GPU 초기화 함수 추가
-def dummy_gpu_init():
-    """GPU 초기화를 위한 더미 함수"""
-    if torch.cuda.is_available():
-        try:
-            # 간단한 텐서 연산으로 GPU 초기화
-            dummy_tensor = torch.zeros(1).to(device)
-            del dummy_tensor
-            logging.info("GPU initialized successfully")
-        except Exception as e:
-            logging.warning(f"GPU initialization warning: {e}")
 # CSS
 css = """
 :root {
@@ -751,6 +756,10 @@ demo = gr.Blocks(css=css, title="AI 이미지 & 비디오 & 오디오 생성기"
 with demo:
     gr.Markdown("# 🎨 Ginigen 스튜디오")
     with gr.Tabs() as tabs:
         # 첫 번째 탭: 텍스트 to 이미지
@@ -896,7 +905,7 @@ with demo:
                         gr.Markdown("### 🎵 오디오 생성 설정")
                         audio_prompt = gr.Textbox(
-                            label="프롬프트 (한글 지원)" if MMAUDIO_MODEL_LOADED and translator else "프롬프트",
                             placeholder="생성하고 싶은 오디오를 설명하세요... (예: 평화로운 피아노 음악)",
                             lines=3
                         )
@@ -927,9 +936,6 @@ with demo:
                             label="오디오가 추가된 비디오",
                             interactive=False
                         )
-                        if not MMAUDIO_MODEL_LOADED:
-                            gr.Markdown("⚠️ MMAudio 모델을 로드하지 못했습니다. 이 기능은 사용할 수 없습니다.")
         # 네 번째 탭: 비디오 편집
         with gr.Tab("비디오 편집", elem_classes="tabitem"):
@@ -1077,9 +1083,6 @@ with demo:
                         )
                         bg_remove_btn = gr.Button("🎬 배경 변경", variant="primary", elem_id="bg-remove-btn")
-                        if not BIREFNET_MODEL_LOADED:
-                            gr.Markdown("⚠️ BiRefNet 모델을 로드하지 못했습니다. 이 기능은 사용할 수 없습니다.")
                 # 출력 컬럼
                 with gr.Column(scale=1):
@@ -1100,6 +1103,17 @@ with demo:
                         긴 비디오는 작은 조각으로 나누어 처리하세요.
                         """)
     # 이벤트 연결 - 첫 번째 탭
     size_preset.change(update_dimensions, [size_preset], [width, height])
@@ -1167,20 +1181,12 @@ with demo:
                 fps_slider, video_handling_radio, fast_mode_checkbox, max_workers_slider],
         outputs=[stream_image, output_bg_video, time_textbox]
     )
-# GPU 초기화 (Spaces 환경에서 필요)
-try:
-    if IS_SPACES and torch.cuda.is_available():
-        # Spaces 환경에서 GPU 워밍업 실행
-        gpu_warmup()
-        logging.info("GPU warmed up successfully")
-    elif torch.cuda.is_available():
-        dummy_gpu_init()
-except Exception as e:
-    logging.warning(f"GPU initialization warning: {e}")
 if __name__ == "__main__":
-    # Spaces 환경에서 추가 GPU 체크
     if IS_SPACES:
         try:
             gpu_warmup()

+# Spaces GPU - 반드시 첫 번째로 import해야 함!
+import os
+IS_SPACES = os.environ.get("SPACE_ID") is not None
+if IS_SPACES:
+    import spaces
+else:
+    # GPU 데코레이터가 없을 때를 위한 더미 데코레이터
+    class spaces:
+        @staticmethod
+        def GPU(duration=None):
+            def decorator(func):
+                return func
+            return decorator
+# 이제 다른 라이브러리들을 import
 import gradio as gr
 import numpy as np
 from PIL import Image, ImageDraw
 from gradio_client import Client, handle_file
 import random
 import tempfile
 import logging
 import torch
 from diffusers import AutoencoderKL, TCDScheduler
 # 환경 변수 설정으로 torch.load 체크 우회 (임시 해결책)
 os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"
 # GPU 초기화를 위한 간단한 함수 (Spaces 환경에서 필수)
 @spaces.GPU(duration=1)
 def gpu_warmup():
         del dummy
     return "GPU ready"
+# MMAudio imports - spaces import 이후에 와야 함
 try:
     import mmaudio
 except ImportError:
 from mmaudio.model.sequence_config import SequenceConfig
 from mmaudio.model.utils.features_utils import FeaturesUtils
+# 로깅 설정
+logging.basicConfig(level=logging.INFO)
 # 기존 코드의 모든 설정과 초기화 부분 유지
 torch.set_float32_matmul_precision("medium")
 logging.info(f"Using device: {device}")
+# 전역 변수로 모델 상태 관리
+MODELS_LOADED = False
+BIREFNET_MODEL = None
+BIREFNET_LITE_MODEL = None
+OUTPAINT_PIPE = None
+MMAUDIO_NET = None
+MMAUDIO_FEATURE_UTILS = None
+MMAUDIO_SEQ_CFG = None
+TRANSLATOR = None
 # API URLs
 TEXT2IMG_API_URL = "http://211.233.58.201:7896"
 VIDEO_API_URL = "http://211.233.58.201:7875"
+# Image size presets
 IMAGE_PRESETS = {
     "커스텀": {"width": 1024, "height": 1024},
     "1:1 정사각형": {"width": 1024, "height": 1024},
     "LinkedIn 배너": {"width": 1584, "height": 396},
 }
+# Transform for BiRefNet
+transform_image = transforms.Compose([
+    transforms.Resize((768, 768)),
+    transforms.ToTensor(),
+    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+])
+@spaces.GPU(duration=60)
+def load_models():
+    """모든 모델을 로드하는 함수"""
+    global MODELS_LOADED, BIREFNET_MODEL, BIREFNET_LITE_MODEL, OUTPAINT_PIPE
+    global MMAUDIO_NET, MMAUDIO_FEATURE_UTILS, MMAUDIO_SEQ_CFG, TRANSLATOR
+    if MODELS_LOADED:
+        return True
+    try:
+        # BiRefNet 모델 로드
+        logging.info("Loading BiRefNet models...")
+        BIREFNET_MODEL = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet", trust_remote_code=True)
+        BIREFNET_MODEL.to(device)
+        BIREFNET_LITE_MODEL = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet_lite", trust_remote_code=True)
+        BIREFNET_LITE_MODEL.to(device)
+        # ControlNet 및 Outpainting 모델 로드
+        logging.info("Loading ControlNet models...")
+        from controlnet_union import ControlNetModel_Union
+        from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
+        config_file = hf_hub_download(
+            "xinsir/controlnet-union-sdxl-1.0",
+            filename="config_promax.json",
+        )
+        config = ControlNetModel_Union.load_config(config_file)
+        controlnet_model = ControlNetModel_Union.from_config(config)
+        model_file = hf_hub_download(
+            "xinsir/controlnet-union-sdxl-1.0",
+            filename="diffusion_pytorch_model_promax.safetensors",
+        )
+        state_dict = load_state_dict(model_file)
+        loaded_keys = list(state_dict.keys())
+        result = ControlNetModel_Union._load_pretrained_model(
+            controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
+        )
+        model = result[0]
+        model = model.to(device=device, dtype=torch_dtype)
+        # VAE 로드
+        vae = AutoencoderKL.from_pretrained(
+            "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch_dtype
+        ).to(device)
+        # 파이프라인 로드
+        OUTPAINT_PIPE = StableDiffusionXLFillPipeline.from_pretrained(
+            "SG161222/RealVisXL_V5.0_Lightning",
+            torch_dtype=torch_dtype,
+            vae=vae,
+            controlnet=model,
+            variant="fp16" if device.type == "cuda" else None,
+        ).to(device)
+        OUTPAINT_PIPE.scheduler = TCDScheduler.from_config(OUTPAINT_PIPE.scheduler.config)
+        # MMAudio 모델 로드
+        logging.info("Loading MMAudio models...")
+        model_mmaudio: ModelConfig = all_model_cfg['large_44k_v2']
+        model_mmaudio.download_if_needed()
+        setup_eval_logging()
+        # 번역기 설정
+        try:
+            TRANSLATOR = pipeline("translation",
+                                 model="Helsinki-NLP/opus-mt-ko-en",
+                                 device="cpu",
+                                 use_fast=True,
+                                 trust_remote_code=False)
+        except Exception as e:
+            logging.warning(f"Failed to load translation model: {e}")
+            TRANSLATOR = None
+        # MMAudio 모델 초기화
+        if torch.cuda.is_available():
+            mmaudio_dtype = torch.bfloat16
+        else:
+            mmaudio_dtype = torch.float32
+        with torch.cuda.device(device):
+            MMAUDIO_SEQ_CFG = model_mmaudio.seq_cfg
+            MMAUDIO_NET = get_my_mmaudio(model_mmaudio.model_name).to(device, mmaudio_dtype).eval()
+            MMAUDIO_NET.load_weights(torch.load(model_mmaudio.model_path, map_location=device, weights_only=True))
+            logging.info(f'Loaded weights from {model_mmaudio.model_path}')
+            MMAUDIO_FEATURE_UTILS = FeaturesUtils(
+                tod_vae_ckpt=model_mmaudio.vae_path,
+                synchformer_ckpt=model_mmaudio.synchformer_ckpt,
+                enable_conditions=True,
+                mode=model_mmaudio.mode,
+                bigvgan_vocoder_ckpt=model_mmaudio.bigvgan_16k_path,
+                need_vae_encoder=False
+            ).to(device, mmaudio_dtype).eval()
+        MODELS_LOADED = True
+        logging.info("All models loaded successfully!")
+        return True
+    except Exception as e:
+        logging.error(f"Failed to load models: {str(e)}")
+        return False
 # 기존 함수들 모두 유지
 def update_dimensions(preset):
     if preset in IMAGE_PRESETS:
     mask = Image.new('L', target_size, 255)
     mask_draw = ImageDraw.Draw(mask)
+    # 마스크 영역 그리기
     left_overlap = margin_x + overlap_x if alignment != "왼쪽" else margin_x
     right_overlap = margin_x + new_width - overlap_x if alignment != "오른쪽" else margin_x + new_width
     top_overlap = margin_y + overlap_y if alignment != "위" else margin_y
     if image is None:
         return None
+    # 모델 로드 확인
+    if not MODELS_LOADED:
+        load_models()
+    if OUTPAINT_PIPE is None:
         return Image.new('RGB', (width, height), (200, 200, 200))
     try:
         final_prompt = f"{prompt}, high quality, 4k" if prompt else "high quality, 4k"
         # GPU에서 실행
+        with torch.autocast(device_type=device.type, dtype=torch_dtype):
             (
                 prompt_embeds,
                 negative_prompt_embeds,
                 pooled_prompt_embeds,
                 negative_pooled_prompt_embeds,
+            ) = OUTPAINT_PIPE.encode_prompt(final_prompt, str(device), True)
             # 생성 프로세스
+            for generated_image in OUTPAINT_PIPE(
                 prompt_embeds=prompt_embeds,
                 negative_prompt_embeds=negative_prompt_embeds,
                 pooled_prompt_embeds=pooled_prompt_embeds,
 # MMAudio 관련 함수들
 def translate_prompt(text):
     try:
+        if TRANSLATOR is None:
             return text
         if text and any(ord(char) >= 0x3131 and ord(char) <= 0xD7A3 for char in text):
             with torch.no_grad():
+                translation = TRANSLATOR(text)[0]['translation_text']
             return translation
         return text
     except Exception as e:
 @torch.inference_mode()
 def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                    cfg_strength: float, duration: float):
+    # 모델 로드 확인
+    if not MODELS_LOADED:
+        load_models()
+    if MMAUDIO_NET is None:
         return None
     prompt = translate_prompt(prompt)
     clip_frames, sync_frames, duration = load_video(video, duration)
     clip_frames = clip_frames.unsqueeze(0)
     sync_frames = sync_frames.unsqueeze(0)
+    MMAUDIO_SEQ_CFG.duration = duration
+    MMAUDIO_NET.update_seq_lengths(MMAUDIO_SEQ_CFG.latent_seq_len, MMAUDIO_SEQ_CFG.clip_seq_len, MMAUDIO_SEQ_CFG.sync_seq_len)
     audios = generate(clip_frames,
                       sync_frames, [prompt],
                       negative_text=[negative_prompt],
+                      feature_utils=MMAUDIO_FEATURE_UTILS,
+                      net=MMAUDIO_NET,
                       fm=fm,
                       rng=rng,
                       cfg_strength=cfg_strength)
     make_video(video,
                video_save_path,
                audio,
+               sampling_rate=MMAUDIO_SEQ_CFG.sampling_rate,
+               duration_sec=MMAUDIO_SEQ_CFG.duration)
     return video_save_path
 # 비디오 배경제거 관련 함수들
 def process_bg_image(image, bg, fast_mode=False):
     """단일 이미지 배경 처리"""
+    if BIREFNET_MODEL is None or BIREFNET_LITE_MODEL is None:
         return image
     image_size = image.size
     input_images = transform_image(image).unsqueeze(0).to(device)
+    model = BIREFNET_LITE_MODEL if fast_mode else BIREFNET_MODEL
     with torch.no_grad():
         preds = model(input_images)[-1].sigmoid().cpu()
 def process_video_bg(vid, bg_type="색상", bg_image=None, bg_video=None, color="#00FF00",
                      fps=0, video_handling="slow_down", fast_mode=True, max_workers=10):
     """비디오 배경 처리 메인 함수"""
+    # 모델 로드 확인
+    if not MODELS_LOADED:
+        load_models()
+    if BIREFNET_MODEL is None:
         yield gr.update(visible=False), gr.update(visible=True), "BiRefNet 모델을 로드하지 못했습니다."
         yield None, None, "BiRefNet 모델을 로드하지 못했습니다."
         return
         logging.error(f"Video merge error: {str(e)}")
         return None, f"❌ 오류 발생: {str(e)}"
 # CSS
 css = """
 :root {
 with demo:
     gr.Markdown("# 🎨 Ginigen 스튜디오")
+    gr.Markdown("처음 사용 시 모델 로딩에 시간이 걸릴 수 있습니다. 잠시만 기다려주세요.")
+    # 모델 로드 상태 표시
+    model_status = gr.Textbox(label="모델 상태", value="모델 로딩 대기 중...", interactive=False)
     with gr.Tabs() as tabs:
         # 첫 번째 탭: 텍스트 to 이미지
                         gr.Markdown("### 🎵 오디오 생성 설정")
                         audio_prompt = gr.Textbox(
+                            label="프롬프트 (한글 지원)",
                             placeholder="생성하고 싶은 오디오를 설명하세요... (예: 평화로운 피아노 음악)",
                             lines=3
                         )
                             label="오디오가 추가된 비디오",
                             interactive=False
                         )
         # 네 번째 탭: 비디오 편집
         with gr.Tab("비디오 편집", elem_classes="tabitem"):
                         )
                         bg_remove_btn = gr.Button("🎬 배경 변경", variant="primary", elem_id="bg-remove-btn")
                 # 출력 컬럼
                 with gr.Column(scale=1):
                         긴 비디오는 작은 조각으로 나누어 처리하세요.
                         """)
+    # 모델 로드 함수 실행
+    def on_demo_load():
+        try:
+            if IS_SPACES:
+                # Spaces 환경에서 GPU 워밍업
+                gpu_warmup()
+            # 모델 로드는 첫 번째 GPU 함수 호출 시 자동으로 수행됨
+            return "모델 로딩 준비 완료"
+        except Exception as e:
+            return f"초기화 오류: {str(e)}"
     # 이벤트 연결 - 첫 번째 탭
     size_preset.change(update_dimensions, [size_preset], [width, height])
                 fps_slider, video_handling_radio, fast_mode_checkbox, max_workers_slider],
         outputs=[stream_image, output_bg_video, time_textbox]
     )
+    # 데모 로드 시 실행
+    demo.load(on_demo_load, outputs=model_status)
 if __name__ == "__main__":
+    # Spaces 환경에서 추가 체크
     if IS_SPACES:
         try:
             gpu_warmup()