Spaces:

mohamedsobhi777
/

FramerComfy_Simple_Image2Image_9268381027

Runtime error

App Files Files Community

FramerComfy_Simple_Image2Image_9268381027 / comfy_extras /nodes_hunyuan.py

mohamedsobhi777

Synced repo using 'sync_with_huggingface' Github Action

86d4cbe verified 3 months ago

raw

history blame contribute delete

5.64 kB

	import nodes
	import node_helpers
	import torch
	import comfy.model_management


	class CLIPTextEncodeHunyuanDiT:
	@classmethod
	def INPUT_TYPES(s):
	return {"required": {
	"clip": ("CLIP", ),
	"bert": ("STRING", {"multiline": True, "dynamicPrompts": True}),
	"mt5xl": ("STRING", {"multiline": True, "dynamicPrompts": True}),
	}}
	RETURN_TYPES = ("CONDITIONING",)
	FUNCTION = "encode"

	CATEGORY = "advanced/conditioning"

	def encode(self, clip, bert, mt5xl):
	tokens = clip.tokenize(bert)
	tokens["mt5xl"] = clip.tokenize(mt5xl)["mt5xl"]

	return (clip.encode_from_tokens_scheduled(tokens), )

	class EmptyHunyuanLatentVideo:
	@classmethod
	def INPUT_TYPES(s):
	return {"required": { "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
	"height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
	"length": ("INT", {"default": 25, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
	"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}
	RETURN_TYPES = ("LATENT",)
	FUNCTION = "generate"

	CATEGORY = "latent/video"

	def generate(self, width, height, length, batch_size=1):
	latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
	return ({"samples":latent}, )

	PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
	"<\|start_header_id\|>system<\|end_header_id\|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
	"1. The main content and theme of the video."
	"2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
	"3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
	"4. background environment, light, style and atmosphere."
	"5. camera angles, movements, and transitions used in the video:<\|eot_id\|>\n\n"
	"<\|start_header_id\|>user<\|end_header_id\|>\n\n{}<\|eot_id\|>"
	"<\|start_header_id\|>assistant<\|end_header_id\|>\n\n"
	)

	class TextEncodeHunyuanVideo_ImageToVideo:
	@classmethod
	def INPUT_TYPES(s):
	return {"required": {
	"clip": ("CLIP", ),
	"clip_vision_output": ("CLIP_VISION_OUTPUT", ),
	"prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
	"image_interleave": ("INT", {"default": 2, "min": 1, "max": 512, "tooltip": "How much the image influences things vs the text prompt. Higher number means more influence from the text prompt."}),
	}}
	RETURN_TYPES = ("CONDITIONING",)
	FUNCTION = "encode"

	CATEGORY = "advanced/conditioning"

	def encode(self, clip, clip_vision_output, prompt, image_interleave):
	tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected, image_interleave=image_interleave)
	return (clip.encode_from_tokens_scheduled(tokens), )

	class HunyuanImageToVideo:
	@classmethod
	def INPUT_TYPES(s):
	return {"required": {"positive": ("CONDITIONING", ),
	"vae": ("VAE", ),
	"width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
	"height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
	"length": ("INT", {"default": 53, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
	"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
	"guidance_type": (["v1 (concat)", "v2 (replace)"], )
	},
	"optional": {"start_image": ("IMAGE", ),
	}}

	RETURN_TYPES = ("CONDITIONING", "LATENT")
	RETURN_NAMES = ("positive", "latent")
	FUNCTION = "encode"

	CATEGORY = "conditioning/video_models"

	def encode(self, positive, vae, width, height, length, batch_size, guidance_type, start_image=None):
	latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
	out_latent = {}

	if start_image is not None:
	start_image = comfy.utils.common_upscale(start_image[:length, :, :, :3].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)

	concat_latent_image = vae.encode(start_image)
	mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype)
	mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0

	if guidance_type == "v1 (concat)":
	cond = {"concat_latent_image": concat_latent_image, "concat_mask": mask}
	else:
	cond = {'guiding_frame_index': 0}
	latent[:, :, :concat_latent_image.shape[2]] = concat_latent_image
	out_latent["noise_mask"] = mask

	positive = node_helpers.conditioning_set_values(positive, cond)

	out_latent["samples"] = latent
	return (positive, out_latent)



	NODE_CLASS_MAPPINGS = {
	"CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT,
	"TextEncodeHunyuanVideo_ImageToVideo": TextEncodeHunyuanVideo_ImageToVideo,
	"EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo,
	"HunyuanImageToVideo": HunyuanImageToVideo,
	}