|
|
|
import os |
|
import argparse |
|
|
|
|
|
|
|
import torch |
|
import gradio as gr |
|
|
|
|
|
import numpy as np |
|
import einops |
|
import traceback |
|
|
|
from PIL import Image |
|
from diffusers import AutoencoderKLHunyuanVideo |
|
from transformers import ( |
|
LlamaModel, CLIPTextModel, |
|
LlamaTokenizerFast, CLIPTokenizer, |
|
SiglipImageProcessor, SiglipVisionModel |
|
) |
|
|
|
from diffusers_helper.hf_login import login |
|
from diffusers_helper.hunyuan import ( |
|
encode_prompt_conds, vae_decode, vae_encode, |
|
vae_decode_fake |
|
) |
|
from diffusers_helper.utils import ( |
|
save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, |
|
resize_and_center_crop, generate_timestamp |
|
) |
|
from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked |
|
from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan |
|
from diffusers_helper.memory import ( |
|
gpu, get_cuda_free_memory_gb, unload_complete_models, load_model_as_complete, |
|
DynamicSwapInstaller, move_model_to_device_with_memory_preservation, |
|
offload_model_from_device_for_memory_preservation, fake_diffusers_current_device |
|
) |
|
from diffusers_helper.clip_vision import hf_clip_vision_encode |
|
from diffusers_helper.thread_utils import AsyncStream, async_run |
|
|
|
|
|
|
|
# ----- Command-line interface ---------------------------------------------
# --share     : create a public Gradio share link
# --server    : interface to bind (default: all interfaces)
# --port      : server port (Gradio picks one if omitted)
# --inbrowser : open a browser tab automatically on launch
parser = argparse.ArgumentParser()
parser.add_argument('--share', action='store_true')
parser.add_argument('--server', type=str, default='0.0.0.0')
parser.add_argument('--port', type=int, required=False)
parser.add_argument('--inbrowser', action='store_true')
args = parser.parse_args()

# Route Hugging Face downloads/caches into ./hf_download next to this file.
# NOTE(review): transformers/diffusers were already imported above, and
# huggingface_hub reads HF_HOME at import time — setting it this late may
# have no effect; confirm, or move this assignment above the imports.
os.environ['HF_HOME'] = os.path.abspath(
    os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download'))
)

print(args)

# Choose the memory strategy: with more than 60 GB of free VRAM every model
# stays resident on the GPU; otherwise models are dynamically swapped in and
# out on demand (see DynamicSwapInstaller below).
free_mem_gb = get_cuda_free_memory_gb(gpu)
high_vram = free_mem_gb > 60

print(f'Free VRAM {free_mem_gb} GB')
print(f'High-VRAM Mode: {high_vram}')
|
|
|
|
|
# ----- Model loading -------------------------------------------------------
# Everything is instantiated on the CPU first; GPU placement is decided later
# based on the high_vram flag.
# HunyuanVideo text stack: LLaMA-based primary encoder + CLIP secondary
# encoder, each with its matching tokenizer.
text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()

# SigLIP vision encoder used to condition generation on the input image.
feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()

# The FramePack packed video transformer (the main denoiser), in bf16.
transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePackI2V_HY', torch_dtype=torch.bfloat16).cpu()

# Inference only — put every model into eval mode.
vae.eval(), text_encoder.eval(), text_encoder_2.eval(), image_encoder.eval(), transformer.eval()
|
|
|
|
|
|
|
|
|
|
|
# Low-VRAM mode: slice/tile VAE work so encode/decode fits in memory.
if not high_vram:
    vae.enable_slicing()
    vae.enable_tiling()

# Decode the final frames in fp32 for better output quality.
transformer.high_quality_fp32_output_for_inference = True

# Cast each model to its inference dtype: the transformer runs in bf16,
# everything else in fp16.
transformer.to(dtype=torch.bfloat16)
vae.to(dtype=torch.float16)
image_encoder.to(dtype=torch.float16)
text_encoder.to(dtype=torch.float16)
text_encoder_2.to(dtype=torch.float16)

# No training anywhere — freeze all weights.
for model in [vae, text_encoder, text_encoder_2, image_encoder, transformer]:
    model.requires_grad_(False)

if not high_vram:
    # Low VRAM: install on-demand swapping for the two largest models so
    # their weights move to the GPU only while they are actually in use.
    DynamicSwapInstaller.install_model(transformer, device=gpu)
    DynamicSwapInstaller.install_model(text_encoder, device=gpu)
else:
    # High VRAM: keep the transformer resident on the GPU.
    transformer.to(gpu)

# Queue pair used to exchange commands/results between the UI callbacks and
# the background worker thread.
stream = AsyncStream()

# Folder where finished MP4s are written.
outputs_folder = './outputs/'
os.makedirs(outputs_folder, exist_ok=True)
|
|
|
|
|
def make_progress_bar_css():
    """Build the dark-theme CSS injected into the Gradio Blocks app.

    Forces a black page background with light text and dark styling for the
    stock gradio widgets (buttons, inputs, sliders, media previews).
    """
    dark_theme = """
    body, .gradio-container {
        background-color: #000000 !important;
        color: #FFFFFF !important;
    }
    .gr-button, .gr-input, .gr-textbox, .gr-slider, .gr-checkbox {
        background-color: #1a1a1a !important;
        color: #ffffff !important;
        border-color: #444 !important;
    }
    .gr-button:hover {
        background-color: #333 !important;
    }
    .gr-markdown {
        color: #ddd !important;
    }
    .gr-image-preview, .gr-video {
        background-color: #111 !important;
    }
    """
    return dark_theme
|
|
|
def end_process():
    """Request cancellation of the current generation.

    Pushes the 'end' sentinel onto the shared stream's input queue —
    presumably polled by the background worker so it can stop early
    (worker not visible in this chunk; confirm).
    """
    stream.input_queue.push('end')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache):
    """Gradio generator callback: launch the worker and stream its output.

    Each yield is a 6-tuple for the outputs wired up below:
    (video path, preview image, description markdown, progress HTML,
    start-button update, end-button update). Runs until the worker sends
    the 'end' sentinel.
    """
    global stream

    assert input_image is not None, 'No input image!'

    # Immediately lock the Start button and arm the End button.
    yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)

    # Fresh queue pair for this run, then kick off the background worker.
    stream = AsyncStream()
    async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache)

    last_file = None

    while True:
        flag, payload = stream.output_queue.next()

        if flag == 'end':
            # Final state: show last finished video, hide the preview,
            # re-enable Start and disable End.
            yield last_file, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
            return

        if flag == 'file':
            # A (partial) video file is ready — show it, keep buttons locked.
            last_file = payload
            yield last_file, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
        elif flag == 'progress':
            # Intermediate latent preview plus textual/HTML progress.
            preview, desc, html = payload
            yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
# Example prompts offered in the UI. Each prompt is wrapped in a
# single-element list because gr.Dataset expects one row per sample.
quick_prompts = [
    [text]
    for text in (
        'The girl dances gracefully, with clear movements, full of charm.',
        'A character doing some simple body movements.',
    )
]

# Dark-theme CSS applied to the Blocks app created below.
css = make_progress_bar_css()
|
|
|
# ----- UI ------------------------------------------------------------------
block = gr.Blocks(css=css).queue()
with block:
    gr.Markdown('# FramePack')

    # FIX: `input_image`, `prompt` and `start_button` were referenced below
    # (in `ips` and `start_button.click`) but never created, which raised a
    # NameError while building the UI. Define them here.
    input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
    prompt = gr.Textbox(label="Prompt", value='')
    start_button = gr.Button(value="Start Generation")
    end_button = gr.Button(value="End Generation", interactive=False)

    with gr.Group():
        use_teacache = gr.Checkbox(label='Use TeaCache', value=True)
        n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False)

        seed = gr.Number(label="Seed", value=31337, precision=0)

        total_second_length = gr.Slider(label="Total Video Length (Seconds)", minimum=1, maximum=120, value=5, step=0.1)
        latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, visible=False)
        steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1)
        cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False)
        gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01)
        rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False)
        gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB)", minimum=6, maximum=128, value=6, step=0.1)

    with gr.Column():
        preview_image = gr.Image(label="Next Latents", height=200, visible=False)
        result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
        gr.Markdown('Note: The ending actions are generated before the start. Wait for full video.')
        progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
        progress_bar = gr.HTML('', elem_classes='no-generating-animation')

    # Inputs to `process` (order must match its signature) and the outputs
    # its yields map onto.
    ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache]
    start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
    end_button.click(fn=end_process)
|
|
|
|
|
block.launch( |
|
server_name=args.server, |
|
server_port=args.port, |