englissi committed · verified
Commit 1bc9c1f · 1 Parent(s): b2d14ee

Update app.py

Files changed (1)
  1. app.py +150 -60
app.py CHANGED
@@ -1,39 +1,60 @@
- import os, torch, tempfile
  import gradio as gr
  from diffusers import LTXPipeline, AutoModel
  from diffusers.hooks import apply_group_offloading
  from diffusers.utils import export_to_video

- # --------- Model loading ---------
- def load_pipeline(device="cuda"):
-     dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float16
-
      transformer = AutoModel.from_pretrained(
          "Lightricks/LTX-Video",
          subfolder="transformer",
          torch_dtype=dtype,
-         trust_remote_code=True,  # important: prevents the Placeholder issue
-         variant="bf16" if dtype==torch.bfloat16 else None
      )

-     # fp8 layerwise casting (ignored if the environment does not support it)
      try:
          transformer.enable_layerwise_casting(
-             storage_dtype=torch.float8_e4m3fn, compute_dtype=dtype
          )
-         fp8 = True
      except Exception:
-         fp8 = False

      pipe = LTXPipeline.from_pretrained(
          "Lightricks/LTX-Video",
          transformer=transformer,
          torch_dtype=dtype,
          trust_remote_code=True,
-         variant="bf16" if dtype==torch.bfloat16 else None
      ).to(device)

-     # group offloading (ignored if not supported)
      try:
          onload_device = torch.device(device)
          offload_device = torch.device("cpu")
@@ -43,77 +64,146 @@ def load_pipeline(device="cuda"):
              offload_type="leaf_level",
              use_stream=True
          )
-         apply_group_offloading(pipe.text_encoder, onload_device=onload_device,
-                                offload_type="block_level", num_blocks_per_group=2)
-         apply_group_offloading(pipe.vae, onload_device=onload_device,
-                                offload_type="leaf_level")
-         offload = True
      except Exception:
-         offload = False

-     return pipe, fp8, offload

- PIPE, FP8_OK, OFFLOAD_OK = load_pipeline("cuda" if torch.cuda.is_available() else "cpu")

- # --------- Video generation ---------
- def generate(prompt, negative_prompt,
-              width, height, num_frames, fps,
-              decode_timestep, decode_noise_scale,
-              steps, seed):

-     g = None
-     if seed is not None and seed >= 0:
-         g = torch.Generator(device="cuda" if torch.cuda.is_available() else "cpu").manual_seed(int(seed))

      with torch.inference_mode():
-         result = PIPE(
-             prompt=prompt,
-             negative_prompt=negative_prompt or None,
-             width=width,
-             height=height,
-             num_frames=num_frames,
-             fps=fps,
-             decode_timestep=decode_timestep,
-             decode_noise_scale=decode_noise_scale,
-             num_inference_steps=steps,
              generator=g
          )
-     frames = result.frames[0]

      tmpdir = tempfile.mkdtemp()
      save_path = os.path.join(tmpdir, "output.mp4")
-     export_to_video(frames, save_path, fps=fps)
-     return save_path, f"FP8: {'ON' if FP8_OK else 'OFF'} | Offloading: {'ON' if OFFLOAD_OK else 'OFF'}"

- # --------- Gradio UI ---------
  with gr.Blocks(title="LTX-Video Gradio") as demo:
-     gr.Markdown("## 🎬 LTX-Video Gradio Demo")

      with gr.Row():
-         prompt_in = gr.Textbox(label="Prompt", lines=6, value="A cinematic close-up of a smiling woman under warm sunset light.")
-         neg_in = gr.Textbox(label="Negative Prompt", lines=4, value="worst quality, inconsistent motion, blurry, jittery, distorted")

      with gr.Row():
-         width_in = gr.Slider(256, 1024, step=8, value=768, label="Width")
-         height_in = gr.Slider(256, 1024, step=8, value=512, label="Height")

      with gr.Row():
-         frames_in = gr.Slider(17, 241, step=2, value=65, label="Frames (num_frames)")
-         fps_in = gr.Slider(8, 30, step=1, value=24, label="FPS")

      with gr.Row():
-         dt_in = gr.Slider(0.0, 0.2, step=0.001, value=0.03, label="decode_timestep")
-         dns_in = gr.Slider(0.0, 0.2, step=0.001, value=0.025, label="decode_noise_scale")
-         steps_in = gr.Slider(10, 75, step=1, value=40, label="Inference Steps")
-         seed_in = gr.Number(value=-1, label="Seed (>=0 to fix)")

-     btn = gr.Button("🎥 Generate Video", variant="primary")
      video_out = gr.Video(label="Output", autoplay=True)
-     info_out = gr.Markdown()

-     btn.click(fn=generate,
-               inputs=[prompt_in, neg_in, width_in, height_in,
-                       frames_in, fps_in, dt_in, dns_in, steps_in, seed_in],
-               outputs=[video_out, info_out])

  demo.queue().launch()
 
@@ -1,39 +1,60 @@
+ import os, tempfile
+ import numpy as np
+ import torch
  import gradio as gr
+
  from diffusers import LTXPipeline, AutoModel
  from diffusers.hooks import apply_group_offloading
  from diffusers.utils import export_to_video

+ # -------------------------------------------------------------------
+ # Environment dependencies (for reference):
+ # pip install -U torch torchvision accelerate transformers diffusers safetensors sentencepiece gradio imageio imageio-ffmpeg
+ # (On Spaces/Docker the ffmpeg binary may also be needed: apt-get update && apt-get install -y ffmpeg)
+ # -------------------------------------------------------------------
+
+ def load_pipeline(device: str = "cuda"):
+     """
+     Load the LTX-Video pipeline:
+     - requires sentencepiece (T5 tokenizer)
+     - trust_remote_code=True (avoids the Placeholder issue)
+     - bf16 / FP8 / offloading are enabled only when supported
+     """
+     use_cuda = torch.cuda.is_available()
+     device = "cuda" if use_cuda else "cpu"
+     dtype = torch.bfloat16 if use_cuda else torch.float16  # bf16 only makes sense on CUDA
+
+     # 1) Load the transformer
      transformer = AutoModel.from_pretrained(
          "Lightricks/LTX-Video",
          subfolder="transformer",
          torch_dtype=dtype,
+         trust_remote_code=True,
+         variant="bf16" if dtype == torch.bfloat16 else None
      )

+     # 2) FP8 layerwise casting (only where available)
+     fp8_ok = False
      try:
          transformer.enable_layerwise_casting(
+             storage_dtype=torch.float8_e4m3fn,
+             compute_dtype=dtype
          )
+         fp8_ok = True
      except Exception:
+         fp8_ok = False  # silently skip if the environment does not support it

+     # 3) Load the pipeline
      pipe = LTXPipeline.from_pretrained(
          "Lightricks/LTX-Video",
          transformer=transformer,
          torch_dtype=dtype,
          trust_remote_code=True,
+         variant="bf16" if dtype == torch.bfloat16 else None
      ).to(device)

+     # 4) Group offloading (only where available)
+     offload_ok = False
      try:
          onload_device = torch.device(device)
          offload_device = torch.device("cpu")
 
@@ -43,77 +64,146 @@ def load_pipeline(device="cuda"):
              offload_type="leaf_level",
              use_stream=True
          )
+         apply_group_offloading(
+             pipe.text_encoder,
+             onload_device=onload_device,
+             offload_type="block_level",
+             num_blocks_per_group=2
+         )
+         apply_group_offloading(
+             pipe.vae,
+             onload_device=onload_device,
+             offload_type="leaf_level"
+         )
+         offload_ok = True
      except Exception:
+         offload_ok = False

+     return pipe, fp8_ok, offload_ok, device


+ PIPE, FP8_OK, OFFLOAD_OK, DEVICE = load_pipeline()

+ def _to_uint8_frames(frames):
+     """
+     Safely convert (T, H, W, C) float / torch frames to a uint8 numpy array.
+     """
+     import numpy as np
+
+     if isinstance(frames, torch.Tensor):
+         frames = frames.detach().to("cpu").numpy()
+
+     if frames.ndim == 3:
+         # (T, H, W) -> (T, H, W, 1)
+         frames = frames[..., None]
+
+     assert frames.ndim == 4, f"Unexpected frames shape: {frames.shape}"
+
+     if frames.dtype != np.uint8:
+         # Scale depending on whether values are in the 0-1 or 0-255 range
+         mx = frames.max()
+         if mx <= 1.0:
+             frames = (np.clip(frames, 0, 1) * 255).astype(np.uint8)
+         else:
+             frames = np.clip(frames, 0, 255).astype(np.uint8)

+     return frames
+
+
+ def generate_video(
+     prompt, negative_prompt,
+     width, height, num_frames, fps,
+     decode_timestep, decode_noise_scale,
+     steps, seed
+ ):
+     # Seed
+     g = None
+     if seed is not None:
+         try:
+             s = int(seed)
+             if s >= 0:
+                 g = torch.Generator(device=DEVICE).manual_seed(s)
+         except Exception:
+             pass
+
+     # Inference
      with torch.inference_mode():
+         out = PIPE(
+             prompt=(prompt or "").strip(),
+             negative_prompt=(negative_prompt or "").strip() or None,
+             width=int(width),
+             height=int(height),
+             num_frames=int(num_frames),
+             fps=int(fps),
+             decode_timestep=float(decode_timestep),
+             decode_noise_scale=float(decode_noise_scale),
+             num_inference_steps=int(steps),
              generator=g
          )
+     frames = out.frames[0]  # expected: (T, H, W, C), float / torch

+     # Convert the frames to a safe format
+     frames = _to_uint8_frames(frames)
+
+     # Output path
      tmpdir = tempfile.mkdtemp()
      save_path = os.path.join(tmpdir, "output.mp4")

+     # First choice: the diffusers built-in saver
+     try:
+         export_to_video(frames, save_path, fps=int(fps))
+     except Exception:
+         # Fallback: imageio-ffmpeg
+         import imageio.v3 as iio
+         iio.imwrite(save_path, frames, fps=int(fps), codec="libx264")
+
+     info = (
+         f"FP8: {'ON' if FP8_OK else 'OFF'} | "
+         f"Offloading: {'ON' if OFFLOAD_OK else 'OFF'} | "
+         f"Device: {DEVICE} | "
+         f"Frames: {frames.shape} | FPS: {int(fps)}"
+     )
+     return save_path, info
+
+
+ # --------------------------- Gradio UI ---------------------------
  with gr.Blocks(title="LTX-Video Gradio") as demo:
+     gr.Markdown("## 🎬 LTX-Video — Prompt to Short Video")

      with gr.Row():
+         prompt_in = gr.Textbox(
+             label="Prompt",
+             lines=6,
+             value="A cinematic close-up of a smiling woman under warm sunset light."
+         )
+         neg_in = gr.Textbox(
+             label="Negative Prompt",
+             lines=4,
+             value="worst quality, inconsistent motion, blurry, jittery, distorted"
+         )

      with gr.Row():
+         width_in = gr.Slider(256, 1024, value=768, step=8, label="Width")
+         height_in = gr.Slider(256, 1024, value=512, step=8, label="Height")

      with gr.Row():
+         frames_in = gr.Slider(17, 241, value=65, step=2, label="num_frames")
+         fps_in = gr.Slider(8, 30, value=24, step=1, label="FPS")

      with gr.Row():
+         dt_in = gr.Slider(0.0, 0.2, value=0.03, step=0.001, label="decode_timestep")
+         dns_in = gr.Slider(0.0, 0.2, value=0.025, step=0.001, label="decode_noise_scale")
+         steps_in = gr.Slider(10, 75, value=40, step=1, label="num_inference_steps")
+         seed_in = gr.Number(value=-1, label="Seed (>=0 to fix)")

+     gen_btn = gr.Button("🎥 Generate", variant="primary")
      video_out = gr.Video(label="Output", autoplay=True)
+     info_out = gr.Markdown()

+     gen_btn.click(
+         fn=generate_video,
+         inputs=[prompt_in, neg_in, width_in, height_in, frames_in, fps_in, dt_in, dns_in, steps_in, seed_in],
+         outputs=[video_out, info_out]
+     )

  demo.queue().launch()
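
A note on the new `_to_uint8_frames` helper: it normalizes whatever the pipeline returns (a torch tensor or numpy array, valued in 0..1 or 0..255, with or without a channel axis) into a uint8 (T, H, W, C) array before the video is written. Below is a minimal standalone sketch of the same conversion, runnable without the LTX-Video weights; the random input is purely illustrative and not part of the commit.

```python
import numpy as np
import torch

def to_uint8_frames(frames):
    # Sketch mirroring the commit's _to_uint8_frames: accept torch tensors or numpy arrays.
    if isinstance(frames, torch.Tensor):
        frames = frames.detach().cpu().numpy()
    if frames.ndim == 3:
        frames = frames[..., None]  # (T, H, W) -> (T, H, W, 1)
    assert frames.ndim == 4, f"Unexpected frames shape: {frames.shape}"
    if frames.dtype != np.uint8:
        # Values in 0..1 are rescaled to 0..255; anything else is just clipped.
        if frames.max() <= 1.0:
            frames = (np.clip(frames, 0, 1) * 255).astype(np.uint8)
        else:
            frames = np.clip(frames, 0, 255).astype(np.uint8)
    return frames

# Illustrative check with 8 random 64x64 RGB frames in 0..1:
dummy = torch.rand(8, 64, 64, 3)
out = to_uint8_frames(dummy)
print(out.dtype, out.shape)  # uint8 (8, 64, 64, 3)
```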
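The seed handling also changed: instead of assuming the Gradio Number input is a valid non-negative integer, `generate_video` now coerces it and only builds a seeded `torch.Generator` when the value is >= 0. A sketch of that pattern in isolation (the helper name and the CPU default are illustrative, not part of the commit):

```python
import torch

def make_generator(seed, device="cpu"):
    """Return a seeded torch.Generator for seed >= 0, otherwise None (random run)."""
    try:
        s = int(seed)
    except (TypeError, ValueError):
        return None
    if s < 0:
        return None
    return torch.Generator(device=device).manual_seed(s)

print(make_generator(-1))                 # None -> non-deterministic generation
print(make_generator(42).initial_seed())  # 42   -> reproducible generation
```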