chulanpro5 committed on
Commit fa4c65b · 1 Parent(s): 1dc498e

feat: batch_sampling

Files changed (2):
  1. app.py +153 -21
  2. batch_sample.py +604 -0
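
For orientation, the new batch_sample.py is meant to be driven either from the command line (via the --batch_* flags it adds) or programmatically. A minimal sketch of programmatic use follows; the checkpoint flags and image paths are illustrative, not part of this commit:

from batch_sample import arg_parse, load_fontdiffuer_pipeline, batch_sampling

# e.g. invoked as: python batch_sample.py --ckpt_dir ckpt --demo --batch_size 4
args = arg_parse()
pipe = load_fontdiffuer_pipeline(args=args)

images = batch_sampling(
    args=args,
    pipe=pipe,
    content_inputs=["figures/source_imgs/source_鑫.jpg"],  # content glyph images
    style_inputs=["figures/ref_imgs/ref_雕.jpg"],          # style reference images
)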
app.py CHANGED
@@ -1,13 +1,18 @@
 import random
+from typing import List, Union, Optional, Tuple
+import torch
+from PIL import Image
 import spaces
 import gradio as gr
-from sample import (arg_parse,
+from sample import (arg_parse,
                     sampling,
                     load_fontdiffuer_pipeline)
 
+from batch_sample import batch_sampling
+
 @spaces.GPU()
-def run_fontdiffuer(source_image,
-                    character,
+def run_fontdiffuer(source_image,
+                    character,
                     reference_image,
                     sampling_step,
                     guidance_scale,
@@ -23,12 +28,139 @@ def run_fontdiffuer(source_image,
                          pipe=pipe,
                          content_image=source_image,
                          style_image=reference_image)
-
+
     if out_image is not None:
         out_image.format = 'PNG'
-
+
     return out_image
 
+def _normalize_batch_inputs(source_images, characters, reference_images) -> Tuple[List, List, List, int]:
+    """
+    Normalize different input types to consistent lists.
+
+    Returns:
+        Tuple of (content_inputs, style_inputs, char_inputs, total_samples)
+    """
+    content_inputs = []
+    style_inputs = []
+    char_inputs = []
+
+    # Handle character mode
+    if source_images is None:
+        if isinstance(characters, str):
+            char_inputs = [characters]
+        elif isinstance(characters, list):
+            char_inputs = characters
+        else:
+            return [], [], [], 0
+
+        # Replicate reference images to match character count
+        if isinstance(reference_images, Image.Image):
+            style_inputs = [reference_images] * len(char_inputs)
+        elif isinstance(reference_images, list):
+            if len(reference_images) == 1:
+                style_inputs = reference_images * len(char_inputs)
+            elif len(reference_images) == len(char_inputs):
+                style_inputs = reference_images
+            else:
+                # Cycle through reference images if counts don't match
+                style_inputs = [reference_images[i % len(reference_images)] for i in range(len(char_inputs))]
+
+        total_samples = len(char_inputs)
+
+    # Handle image mode
+    else:
+        if isinstance(source_images, Image.Image):
+            content_inputs = [source_images]
+        elif isinstance(source_images, list):
+            content_inputs = source_images
+        else:
+            return [], [], [], 0
+
+        # Handle reference images
+        if isinstance(reference_images, Image.Image):
+            style_inputs = [reference_images] * len(content_inputs)
+        elif isinstance(reference_images, list):
+            if len(reference_images) == 1:
+                style_inputs = reference_images * len(content_inputs)
+            elif len(reference_images) == len(content_inputs):
+                style_inputs = reference_images
+            else:
+                # Cycle through reference images if counts don't match
+                style_inputs = [reference_images[i % len(reference_images)] for i in range(len(content_inputs))]
+
+        total_samples = len(content_inputs)
+
+    return content_inputs, style_inputs, char_inputs, total_samples
+
+
+@spaces.GPU()
+def run_fontdiffuer_batch(source_images: Union[List[Image.Image], Image.Image, None],
+                          characters: Union[List[str], str, None],
+                          reference_images: Union[List[Image.Image], Image.Image],
+                          sampling_step: int = 50,
+                          guidance_scale: float = 7.5,
+                          batch_size: int = 4,
+                          seed: Optional[int] = None) -> List[Image.Image]:
+    """
+    Run FontDiffuser in batch mode.
+
+    Args:
+        source_images: Single image, list of images, or None (for character mode)
+        characters: Single character, list of characters, or None (for image mode)
+        reference_images: Single style image or list of style images
+        sampling_step: Number of sampling steps
+        guidance_scale: Guidance scale for diffusion
+        batch_size: Batch size for processing
+        seed: Random seed (if None, generates a random seed)
+
+    Returns:
+        List of generated images
+    """
+
+    # Normalize inputs to lists
+    content_inputs, style_inputs, char_inputs, total_samples = _normalize_batch_inputs(
+        source_images, characters, reference_images
+    )
+
+    if total_samples == 0:
+        return []
+
+    # Set up arguments
+    args.character_input = source_images is None
+    args.sampling_step = sampling_step
+    args.guidance_scale = guidance_scale
+    args.batch_size = min(batch_size, total_samples)  # Don't exceed available samples
+    args.seed = seed if seed is not None else random.randint(0, 10000)
+
+    print(f"Processing {total_samples} samples with batch size {args.batch_size}")
+
+    # Use the enhanced batch_sampling function
+    if args.character_input:
+        # Character-based generation
+        generated_images = batch_sampling(
+            args=args,
+            pipe=pipe,
+            content_inputs=content_inputs,  # Empty for character mode
+            style_inputs=style_inputs,
+            content_characters=char_inputs
+        )
+    else:
+        # Image-based generation
+        generated_images = batch_sampling(
+            args=args,
+            pipe=pipe,
+            content_inputs=content_inputs,
+            style_inputs=style_inputs,
+            content_characters=None
+        )
+
+    # Set format for all output images
+    for img in generated_images:
+        img.format = 'PNG'
+
+    return generated_images
+
 
 if __name__ == '__main__':
     args = arg_parse()
@@ -49,18 +181,18 @@ if __name__ == '__main__':
                 FontDiffuser
             </h1>
             <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
-                <a href="https://yeungchenwa.github.io/">Zhenhua Yang</a>,
-                <a href="https://scholar.google.com/citations?user=6zNgcjAAAAAJ&hl=zh-CN&oi=ao">Dezhi Peng</a>,
-                <a href="https://github.com/kyxscut">Yuxin Kong</a>,
-                <a href="https://github.com/ZZXF11">Yuyi Zhang</a>,
-                <a href="https://scholar.google.com/citations?user=IpmnLFcAAAAJ&hl=zh-CN&oi=ao">Cong Yao</a>,
+                <a href="https://yeungchenwa.github.io/">Zhenhua Yang</a>,
+                <a href="https://scholar.google.com/citations?user=6zNgcjAAAAAJ&hl=zh-CN&oi=ao">Dezhi Peng</a>,
+                <a href="https://github.com/kyxscut">Yuxin Kong</a>,
+                <a href="https://github.com/ZZXF11">Yuyi Zhang</a>,
+                <a href="https://scholar.google.com/citations?user=IpmnLFcAAAAJ&hl=zh-CN&oi=ao">Cong Yao</a>,
                 <a href="http://www.dlvc-lab.net/lianwen/Index.html">Lianwen Jin</a>†
             </h2>
             <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                 <strong>South China University of Technology</strong>, Alibaba DAMO Academy
             </h2>
-            <h3 style="font-weight: 450; font-size: 1rem; margin: 0rem">
-                [<a href="https://arxiv.org/abs/2312.12142" style="color:blue;">arXiv</a>]
+            <h3 style="font-weight: 450; font-size: 1rem; margin: 0rem">
+                [<a href="https://arxiv.org/abs/2312.12142" style="color:blue;">arXiv</a>]
                 [<a href="https://yeungchenwa.github.io/fontdiffuser-homepage/" style="color:green;">Homepage</a>]
                 [<a href="https://github.com/yeungchenwa/FontDiffuser" style="color:green;">Github</a>]
             </h3>
@@ -83,12 +215,12 @@ if __name__ == '__main__':
                 with gr.Row():
                     fontdiffuer_output_image = gr.Image(height=200, label="FontDiffuser Output Image", image_mode='RGB', type='pil', format='png')
 
-                sampling_step = gr.Slider(20, 50, value=20, step=10,
+                sampling_step = gr.Slider(20, 50, value=20, step=10,
                                           label="Sampling Step", info="The sampling step by FontDiffuser.")
-                guidance_scale = gr.Slider(1, 12, value=7.5, step=0.5,
-                                           label="Scale of Classifier-free Guidance",
+                guidance_scale = gr.Slider(1, 12, value=7.5, step=0.5,
+                                           label="Scale of Classifier-free Guidance",
                                            info="The scale used for classifier-free guidance sampling")
-                batch_size = gr.Slider(1, 4, value=1, step=1,
+                batch_size = gr.Slider(1, 4, value=1, step=1,
                                        label="Batch Size", info="The number of images to be sampled.")
 
                 FontDiffuser = gr.Button('Run FontDiffuser')
@@ -101,7 +233,7 @@ if __name__ == '__main__':
             gr.Markdown("### In this mode, we provide both the source image and \
                 the reference image for you to try our demo!")
             gr.Examples(
-                examples=[['figures/source_imgs/source_灨.jpg', 'figures/ref_imgs/ref_籍.jpg'],
+                examples=[['figures/source_imgs/source_灨.jpg', 'figures/ref_imgs/ref_籍.jpg'],
                           ['figures/source_imgs/source_鑻.jpg', 'figures/ref_imgs/ref_鹰.jpg'],
                           ['figures/source_imgs/source_鑫.jpg', 'figures/ref_imgs/ref_壤.jpg'],
                           ['figures/source_imgs/source_釅.jpg', 'figures/ref_imgs/ref_雕.jpg']],
@@ -124,7 +256,7 @@ if __name__ == '__main__':
                 you can upload your own source image or you choose the character above \
                 to try our demo!")
             gr.Examples(
-                examples=['figures/ref_imgs/ref_闡.jpg',
+                examples=['figures/ref_imgs/ref_闡.jpg',
                           'figures/ref_imgs/ref_雕.jpg',
                           'figures/ref_imgs/ref_豄.jpg',
                           'figures/ref_imgs/ref_馨.jpg',
@@ -145,11 +277,11 @@ if __name__ == '__main__':
         )
         FontDiffuser.click(
             fn=run_fontdiffuer,
-            inputs=[source_image,
-                    character,
+            inputs=[source_image,
+                    character,
                     reference_image,
                     sampling_step,
                     guidance_scale,
                     batch_size],
            outputs=fontdiffuer_output_image)
-    demo.launch(debug=True)
+    demo.launch(debug=True)
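
The diff above keeps the single-image path unchanged and adds run_fontdiffuer_batch, which normalizes flexible inputs via _normalize_batch_inputs. A sketch of calling it in character mode, assuming the app's globals (args, pipe) are already set up; the reference image path is illustrative:

from PIL import Image

ref = Image.open("figures/ref_imgs/ref_雕.jpg")
outs = run_fontdiffuer_batch(
    source_images=None,        # character mode: glyphs rendered from the TTF
    characters=["鑫", "壤"],   # one output image per character
    reference_images=ref,      # a single style image is replicated
    sampling_step=20,
    guidance_scale=7.5,
    batch_size=2,
)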
batch_sample.py ADDED
@@ -0,0 +1,604 @@
+import os
+import time
+from PIL import Image
+from typing import List, Tuple, Optional, Union
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+
+import torch
+import torchvision.transforms as transforms
+from accelerate.utils import set_seed
+
+from src import (
+    FontDiffuserDPMPipeline,
+    FontDiffuserModelDPM,
+    build_ddpm_scheduler,
+    build_unet,
+    build_content_encoder,
+    build_style_encoder,
+)
+from utils import (
+    ttf2im,
+    load_ttf,
+    is_char_in_font,
+    save_args_to_yaml,
+    save_single_image,
+    save_image_with_content_style,
+)
+
+
+class BatchProcessor:
+    """Handles batch processing logic for FontDiffuser"""
+
+    def __init__(self, args):
+        self.args = args
+        self.device = args.device
+        self.max_batch_size = getattr(args, "max_batch_size", 8)
+        self.num_workers = getattr(args, "num_workers", 4)
+
+    def batch_image_process(
+        self,
+        content_inputs: List[Union[str, Image.Image]],
+        style_inputs: List[Union[str, Image.Image]],
+        content_characters: Optional[List[str]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, List[Optional[Image.Image]]]:
+        """
+        Process multiple images in batch.
+
+        Args:
+            content_inputs: List of content image paths or PIL Images
+            style_inputs: List of style image paths or PIL Images
+            content_characters: List of characters if using character input mode
+
+        Returns:
+            Tuple of (content_tensors, style_tensors, content_pil_images)
+        """
+        # In character mode content_inputs may be empty, so size the batch
+        # from the characters when they are given.
+        batch_size = (
+            len(content_characters) if content_characters else len(content_inputs)
+        )
+        assert len(style_inputs) == batch_size, (
+            "Content and style inputs must have same length"
+        )
+
+        # Transform setup
+        content_inference_transforms = transforms.Compose(
+            [
+                transforms.Resize(
+                    self.args.content_image_size,
+                    interpolation=transforms.InterpolationMode.BILINEAR,
+                ),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+        style_inference_transforms = transforms.Compose(
+            [
+                transforms.Resize(
+                    self.args.style_image_size,
+                    interpolation=transforms.InterpolationMode.BILINEAR,
+                ),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+        content_tensors = []
+        style_tensors = []
+        content_pil_images = []
+
+        # Process in parallel using ThreadPoolExecutor for I/O operations
+        with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
+            # Submit content processing tasks
+            content_futures = []
+            for i in range(batch_size):
+                if content_characters:
+                    future = executor.submit(
+                        self._process_content_character,
+                        content_characters[i],
+                        content_inference_transforms,
+                    )
+                else:
+                    future = executor.submit(
+                        self._process_content_image,
+                        content_inputs[i],
+                        content_inference_transforms,
+                    )
+                content_futures.append(future)
+
+            # Submit style processing tasks
+            style_futures = []
+            for style_input in style_inputs:
+                future = executor.submit(
+                    self._process_style_image, style_input, style_inference_transforms
+                )
+                style_futures.append(future)
+
+            # Collect results in submission order so content/style pairs stay
+            # aligned (result() blocks until the corresponding future is done).
+            for future in content_futures:
+                try:
+                    content_tensor, content_pil = future.result()
+                    if content_tensor is not None:
+                        content_tensors.append(content_tensor)
+                        content_pil_images.append(content_pil)
+                except Exception as e:
+                    print(f"Error processing content: {e}")
+                    continue
+
+            for future in style_futures:
+                try:
+                    style_tensor = future.result()
+                    if style_tensor is not None:
+                        style_tensors.append(style_tensor)
+                except Exception as e:
+                    print(f"Error processing style: {e}")
+                    continue
+
+        # Stack tensors into batches
+        if content_tensors and style_tensors:
+            content_batch = torch.stack(content_tensors)
+            style_batch = torch.stack(style_tensors)
+            return content_batch, style_batch, content_pil_images
+        else:
+            return None, None, []
+
+    def _process_content_character(
+        self, character: str, transform
+    ) -> Tuple[Optional[torch.Tensor], Optional[Image.Image]]:
+        """Process content character into tensor"""
+        if not is_char_in_font(font_path=self.args.ttf_path, char=character):
+            print(f"Character '{character}' not found in font")
+            return None, None
+
+        font = load_ttf(ttf_path=self.args.ttf_path)
+        content_image = ttf2im(font=font, char=character)
+        content_image_pil = content_image.copy()
+        content_tensor = transform(content_image)
+
+        return content_tensor, content_image_pil
+
+    def _process_content_image(
+        self, image_input: Union[str, Image.Image], transform
+    ) -> Tuple[Optional[torch.Tensor], None]:
+        """Process content image into tensor"""
+        try:
+            if isinstance(image_input, str):
+                content_image = Image.open(image_input).convert("RGB")
+            else:
+                content_image = image_input.convert("RGB")
+
+            content_tensor = transform(content_image)
+            return content_tensor, None
+        except Exception as e:
+            print(f"Error processing content image: {e}")
+            return None, None
+
+    def _process_style_image(
+        self, image_input: Union[str, Image.Image], transform
+    ) -> Optional[torch.Tensor]:
+        """Process style image into tensor"""
+        try:
+            if isinstance(image_input, str):
+                style_image = Image.open(image_input).convert("RGB")
+            else:
+                style_image = image_input.convert("RGB")
+
+            style_tensor = transform(style_image)
+            return style_tensor
+        except Exception as e:
+            print(f"Error processing style image: {e}")
+            return None
+
+
+def arg_parse():
+    from configs.fontdiffuser import get_parser
+
+    parser = get_parser()
+    parser.add_argument("--ckpt_dir", type=str, default=None)
+    parser.add_argument("--demo", action="store_true")
+    parser.add_argument(
+        "--controlnet",
+        type=bool,
+        default=False,
+        help="If in demo mode, the controlnet can be added.",
+    )
+    parser.add_argument("--character_input", action="store_true")
+    parser.add_argument("--content_character", type=str, default=None)
+    parser.add_argument("--content_image_path", type=str, default=None)
+    parser.add_argument("--style_image_path", type=str, default=None)
+    parser.add_argument("--save_image", action="store_true")
+    parser.add_argument(
+        "--save_image_dir", type=str, default=None, help="The saving directory."
+    )
+    parser.add_argument("--device", type=str, default="cuda:0")
+    parser.add_argument("--ttf_path", type=str, default="ttf/KaiXinSongA.ttf")
+
+    # Batch processing arguments
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=4,
+        help="Batch size for processing multiple images",
+    )
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        default=8,
+        help="Maximum batch size based on GPU memory",
+    )
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        default=4,
+        help="Number of workers for parallel image loading",
+    )
+    parser.add_argument(
+        "--batch_content_paths",
+        type=str,
+        nargs="+",
+        default=None,
+        help="List of content image paths for batch processing",
+    )
+    parser.add_argument(
+        "--batch_style_paths",
+        type=str,
+        nargs="+",
+        default=None,
+        help="List of style image paths for batch processing",
+    )
+    parser.add_argument(
+        "--batch_characters",
+        type=str,
+        nargs="+",
+        default=None,
+        help="List of characters for batch processing",
+    )
+    parser.add_argument(
+        "--adaptive_batch_size",
+        action="store_true",
+        help="Automatically adjust batch size based on GPU memory",
+    )
+
+    args = parser.parse_args()
+    style_image_size = args.style_image_size
+    content_image_size = args.content_image_size
+    args.style_image_size = (style_image_size, style_image_size)
+    args.content_image_size = (content_image_size, content_image_size)
+
+    return args
+
+
+def get_optimal_batch_size(args) -> int:
+    """Determine optimal batch size based on GPU memory"""
+    if not torch.cuda.is_available():
+        return 1
+
+    # Get GPU memory info in GB
+    gpu_memory = torch.cuda.get_device_properties(args.device).total_memory / (
+        1024**3
+    )
+
+    # Estimate batch size based on GPU memory (rough heuristic)
+    if gpu_memory >= 24:  # RTX 4090, A100, etc.
+        optimal_batch = min(16, args.max_batch_size)
+    elif gpu_memory >= 12:  # RTX 3080 Ti, RTX 4070 Ti, etc.
+        optimal_batch = min(8, args.max_batch_size)
+    elif gpu_memory >= 8:  # RTX 3070, RTX 4060 Ti, etc.
+        optimal_batch = min(4, args.max_batch_size)
+    else:  # Lower-end GPUs
+        optimal_batch = min(2, args.max_batch_size)
+
+    return optimal_batch
+
+
+def load_fontdiffuer_pipeline(args):
+    """Load FontDiffuser pipeline (unchanged from original)"""
+    # Load the model state_dict
+    unet = build_unet(args=args)
+    unet.load_state_dict(torch.load(f"{args.ckpt_dir}/unet.pth"))
+    style_encoder = build_style_encoder(args=args)
+    style_encoder.load_state_dict(torch.load(f"{args.ckpt_dir}/style_encoder.pth"))
+    content_encoder = build_content_encoder(args=args)
+    content_encoder.load_state_dict(torch.load(f"{args.ckpt_dir}/content_encoder.pth"))
+    model = FontDiffuserModelDPM(
+        unet=unet, style_encoder=style_encoder, content_encoder=content_encoder
+    )
+    model.to(args.device)
+    print("Loaded the model state_dict successfully!")
+
+    # Load the training ddpm_scheduler.
+    train_scheduler = build_ddpm_scheduler(args=args)
+    print("Loaded training DDPM scheduler successfully!")
+
+    # Load the DPM_Solver to generate the sample.
+    pipe = FontDiffuserDPMPipeline(
+        model=model,
+        ddpm_train_scheduler=train_scheduler,
+        model_type=args.model_type,
+        guidance_type=args.guidance_type,
+        guidance_scale=args.guidance_scale,
+    )
+    print("Loaded dpm_solver pipeline successfully!")
+
+    return pipe
+
+
+def batch_sampling(
+    args,
+    pipe,
+    content_inputs: List[Union[str, Image.Image]],
+    style_inputs: List[Union[str, Image.Image]],
+    content_characters: Optional[List[str]] = None,
+) -> List[Image.Image]:
+    """
+    Perform batch sampling with FontDiffuser.
+
+    Args:
+        args: Arguments
+        pipe: FontDiffuser pipeline
+        content_inputs: List of content images/paths
+        style_inputs: List of style images/paths
+        content_characters: List of characters (if using character input)
+
+    Returns:
+        List of generated images
+    """
+    if not args.demo:
+        os.makedirs(args.save_image_dir, exist_ok=True)
+        save_args_to_yaml(
+            args=args, output_file=f"{args.save_image_dir}/sampling_config.yaml"
+        )
+
+    if args.seed:
+        set_seed(seed=args.seed)
+
+    # Determine optimal batch size
+    if args.adaptive_batch_size:
+        optimal_batch_size = get_optimal_batch_size(args)
+        print(f"Using adaptive batch size: {optimal_batch_size}")
+    else:
+        optimal_batch_size = args.batch_size
+
+    batch_processor = BatchProcessor(args)
+    # In character mode content_inputs is empty, so count the characters instead.
+    total_samples = (
+        len(content_characters) if content_characters else len(content_inputs)
+    )
+    all_generated_images = []
+
+    print(f"Processing {total_samples} samples in batches of {optimal_batch_size}")
+
+    # Process in batches
+    for batch_start in range(0, total_samples, optimal_batch_size):
+        batch_end = min(batch_start + optimal_batch_size, total_samples)
+        batch_content = content_inputs[batch_start:batch_end]
+        batch_style = style_inputs[batch_start:batch_end]
+        batch_chars = (
+            content_characters[batch_start:batch_end] if content_characters else None
+        )
+
+        print(
+            f"Processing batch {batch_start // optimal_batch_size + 1}/{(total_samples + optimal_batch_size - 1) // optimal_batch_size}"
+        )
+
+        # Process batch
+        content_batch, style_batch, content_pil_images = (
+            batch_processor.batch_image_process(batch_content, batch_style, batch_chars)
+        )
+
+        if content_batch is None or style_batch is None:
+            print("Skipping batch due to processing errors")
+            continue
+
+        current_batch_size = content_batch.shape[0]
+
+        with torch.no_grad():
+            content_batch = content_batch.to(args.device)
+            style_batch = style_batch.to(args.device)
+
+            print(f"Generating {current_batch_size} images with DPM-Solver++...")
+            start_time = time.time()
+
+            try:
+                # Generate batch
+                images = pipe.generate(
+                    content_images=content_batch,
+                    style_images=style_batch,
+                    batch_size=current_batch_size,
+                    order=args.order,
+                    num_inference_step=args.num_inference_steps,
+                    content_encoder_downsample_size=args.content_encoder_downsample_size,
+                    t_start=args.t_start,
+                    t_end=args.t_end,
+                    dm_size=args.content_image_size,
+                    algorithm_type=args.algorithm_type,
+                    skip_type=args.skip_type,
+                    method=args.method,
+                    correcting_x0_fn=args.correcting_x0_fn,
+                )
+
+                end_time = time.time()
+                print(f"Batch generation completed in {end_time - start_time:.2f}s")
+
+                # Save images if requested
+                if args.save_image:
+                    save_batch_images(
+                        args,
+                        images,
+                        content_pil_images,
+                        batch_content,
+                        batch_style,
+                        batch_start,
+                    )
+
+                all_generated_images.extend(images)
+
+            except RuntimeError as e:
+                if "out of memory" in str(e).lower():
+                    print(
+                        f"GPU out of memory with batch size {current_batch_size}, trying smaller batch..."
+                    )
+                    torch.cuda.empty_cache()
+                    # Retry with smaller batch
+                    smaller_batch_size = max(1, current_batch_size // 2)
+                    for sub_batch_start in range(
+                        0, current_batch_size, smaller_batch_size
+                    ):
+                        sub_batch_end = min(
+                            sub_batch_start + smaller_batch_size, current_batch_size
+                        )
+                        sub_content = content_batch[sub_batch_start:sub_batch_end]
+                        sub_style = style_batch[sub_batch_start:sub_batch_end]
+
+                        sub_images = pipe.generate(
+                            content_images=sub_content,
+                            style_images=sub_style,
+                            batch_size=sub_batch_end - sub_batch_start,
+                            order=args.order,
+                            num_inference_step=args.num_inference_steps,
+                            content_encoder_downsample_size=args.content_encoder_downsample_size,
+                            t_start=args.t_start,
+                            t_end=args.t_end,
+                            dm_size=args.content_image_size,
+                            algorithm_type=args.algorithm_type,
+                            skip_type=args.skip_type,
+                            method=args.method,
+                            correcting_x0_fn=args.correcting_x0_fn,
+                        )
+                        all_generated_images.extend(sub_images)
+                else:
+                    print(f"Error during generation: {e}")
+                    continue
+
+        # Clear GPU cache between batches
+        torch.cuda.empty_cache()
+
+    print(f"Batch processing completed! Generated {len(all_generated_images)} images.")
+    return all_generated_images
+
+
+def save_batch_images(
+    args, images, content_pil_images, batch_content, batch_style, batch_offset
+):
+    """Save batch of generated images"""
+    for i, image in enumerate(images):
+        # Create unique filename for each image
+        image_idx = batch_offset + i
+        save_single_image(
+            save_dir=args.save_image_dir, image=image, suffix=f"_{image_idx:04d}"
+        )
+
+        # Save with content and style context if available
+        if args.character_input and i < len(content_pil_images):
+            save_image_with_content_style(
+                save_dir=args.save_image_dir,
+                image=image,
+                content_image_pil=content_pil_images[i],
+                content_image_path=None,
+                style_image_path=batch_style[i]
+                if isinstance(batch_style[i], str)
+                else None,
+                resolution=args.resolution,
+                suffix=f"_{image_idx:04d}",
+            )
+        elif not args.character_input:
+            save_image_with_content_style(
+                save_dir=args.save_image_dir,
+                image=image,
+                content_image_pil=None,
+                content_image_path=batch_content[i]
+                if isinstance(batch_content[i], str)
+                else None,
+                style_image_path=batch_style[i]
+                if isinstance(batch_style[i], str)
+                else None,
+                resolution=args.resolution,
+                suffix=f"_{image_idx:04d}",
+            )
+
+
+def sampling(args, pipe, content_image=None, style_image=None):
+    """Original single-image sampling function (for backward compatibility)"""
+    if not args.demo:
+        os.makedirs(args.save_image_dir, exist_ok=True)
+        save_args_to_yaml(
+            args=args, output_file=f"{args.save_image_dir}/sampling_config.yaml"
+        )
+
+    if args.seed:
+        set_seed(seed=args.seed)
+
+    # Use single image processing
+    if args.character_input:
+        # The attribute always exists after arg_parse, so test its value
+        # (falling back to "A"), not merely hasattr.
+        content_inputs = (
+            [args.content_character]
+            if getattr(args, "content_character", None)
+            else ["A"]
+        )
+        style_inputs = [style_image or args.style_image_path]
+        result = batch_sampling(args, pipe, [], style_inputs, content_inputs)
+    else:
+        content_inputs = [content_image or args.content_image_path]
+        style_inputs = [style_image or args.style_image_path]
+        result = batch_sampling(args, pipe, content_inputs, style_inputs)
+
+    return result[0] if result else None
+
+
+# Additional utility functions for batch processing
+def load_images_from_directory(
+    directory_path: str, extensions: List[str] = [".jpg", ".jpeg", ".png", ".bmp"]
+) -> List[str]:
+    """Load all image paths from a directory"""
+    directory = Path(directory_path)
+    image_paths = []
+
+    for ext in extensions:
+        image_paths.extend(directory.glob(f"*{ext}"))
+        image_paths.extend(directory.glob(f"*{ext.upper()}"))
+
+    return [str(path) for path in sorted(image_paths)]
+
+
+def create_batch_from_config(
+    config_file: str,
+) -> Tuple[List[str], List[str], List[str]]:
+    """Create batch inputs from a configuration file"""
+    import json
+
+    with open(config_file, "r") as f:
+        config = json.load(f)
+
+    content_inputs = config.get("content_images", [])
+    style_inputs = config.get("style_images", [])
+    characters = config.get("characters", [])
+
+    return content_inputs, style_inputs, characters
+
+
+if __name__ == "__main__":
+    args = arg_parse()
+
+    # Load fontdiffuser pipeline
+    pipe = load_fontdiffuer_pipeline(args=args)
+
+    # Check if batch processing is requested
+    if args.batch_content_paths or args.batch_style_paths or args.batch_characters:
+        # Batch processing mode
+        content_inputs = args.batch_content_paths or []
+        style_inputs = args.batch_style_paths or []
+        characters = args.batch_characters or None
+
+        if characters and args.character_input:
+            # Character-based batch processing
+            style_inputs = style_inputs or [args.style_image_path] * len(characters)
+            generated_images = batch_sampling(args, pipe, [], style_inputs, characters)
+        else:
+            # Image-based batch processing
+            if len(content_inputs) != len(style_inputs):
+                print("Error: Number of content and style images must match")
+                exit(1)
+            generated_images = batch_sampling(args, pipe, content_inputs, style_inputs)
+
+        print(f"Batch processing completed! Generated {len(generated_images)} images.")
+    else:
+        # Single image processing (original behavior)
+        out_image = sampling(args=args, pipe=pipe)
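
create_batch_from_config above reads the content_images, style_images, and characters keys from a JSON file. A sketch of producing a compatible config; the file name is illustrative:

import json

config = {
    "content_images": ["figures/source_imgs/source_鑫.jpg"],
    "style_images": ["figures/ref_imgs/ref_雕.jpg"],
    "characters": [],
}
with open("batch_config.json", "w") as f:
    json.dump(config, f, ensure_ascii=False, indent=2)

content_inputs, style_inputs, characters = create_batch_from_config("batch_config.json")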