File size: 14,472 Bytes
508b842
fa4c65b
 
 
44e58c2
508b842
fa4c65b
508b842
 
 
fa4c65b
 
44e58c2
fa4c65b
 
508b842
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa4c65b
1dc498e
 
fa4c65b
508b842
 
fa4c65b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ea8d63
 
 
 
 
 
 
 
 
fa4c65b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ea8d63
 
 
fa4c65b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ea8d63
 
 
 
fa4c65b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508b842
 
 
 
 
 
eb1f8ea
508b842
b14c4f8
 
 
 
508b842
 
 
 
 
 
 
 
 
 
 
 
fa4c65b
 
 
 
 
508b842
 
 
 
 
fa4c65b
 
7cb87c9
508b842
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1dc498e
508b842
fa4c65b
508b842
fa4c65b
 
508b842
fa4c65b
508b842
 
 
 
 
 
 
 
 
 
 
 
fa4c65b
508b842
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa4c65b
508b842
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa4c65b
 
508b842
 
 
 
 
3ea8d63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa4c65b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
import random
from typing import List, Union, Optional, Tuple
import torch
from PIL import Image
import spaces
import gradio as gr
from sample import (arg_parse,
                    sampling,
                    load_fontdiffuer_pipeline)

from batch_sample import batch_sampling

@spaces.GPU()
def run_fontdiffuer(source_image,
                    character,
                    reference_image,
                    sampling_step,
                    guidance_scale,
                    batch_size):
    """Generate a single image for the Gradio UI.

    The request settings are written onto the module-level ``args`` object
    and ``sampling`` is then invoked with the module-level ``pipe``.

    Args:
        source_image: Content image, or None to use ``character`` instead.
        character: Stored as ``args.content_character``.
        reference_image: Passed to ``sampling`` as ``style_image``.
        sampling_step: Stored as ``args.sampling_step``.
        guidance_scale: Stored as ``args.guidance_scale``.
        batch_size: Stored as ``args.batch_size``.

    Returns:
        Whatever ``sampling`` returned. When that is not None, its ``format``
        attribute is set to ``'PNG'`` before it is returned.
    """
    request_settings = (
        # Character mode is active exactly when no source image was supplied.
        ('character_input', source_image is None),
        ('content_character', character),
        ('sampling_step', sampling_step),
        ('guidance_scale', guidance_scale),
        ('batch_size', batch_size),
        # A fresh seed in [0, 10000] is drawn for every request.
        ('seed', random.randint(0, 10000)),
    )
    for field, value in request_settings:
        setattr(args, field, value)

    result = sampling(args=args,
                      pipe=pipe,
                      content_image=source_image,
                      style_image=reference_image)

    if result is None:
        return result

    result.format = 'PNG'
    return result

def _normalize_batch_inputs(source_images, characters, reference_images) -> Tuple[List, List, List, int]:
    """
    Normalize different input types to consistent lists

    Returns:
        Tuple of (content_inputs, style_inputs, char_inputs, total_samples)
    """
    content_inputs = []
    style_inputs = []
    char_inputs = []

    # Handle character mode
    if source_images is None:
        if isinstance(characters, str):
            char_inputs = [characters]
        elif isinstance(characters, list):
            char_inputs = characters
        else:
            return [], [], [], 0

        # Replicate reference images to match character count
        if isinstance(reference_images, Image.Image):
            style_inputs = [reference_images] * len(char_inputs)
        elif isinstance(reference_images, list):
            if len(reference_images) == 1:
                style_inputs = reference_images * len(char_inputs)
            elif len(reference_images) == len(char_inputs):
                style_inputs = reference_images
            else:
                # Cycle through reference images if counts don't match
                style_inputs = [reference_images[i % len(reference_images)] for i in range(len(char_inputs))]

        total_samples = len(char_inputs)

    # Handle image mode
    else:
        if isinstance(source_images, Image.Image):
            content_inputs = [source_images]
        elif isinstance(source_images, list):
            # Handle Gradio Gallery format: list of tuples (image, caption)
            content_inputs = []
            for item in source_images:
                if isinstance(item, tuple) and len(item) >= 1:
                    # Extract the image from tuple (image, caption)
                    content_inputs.append(item[0])
                elif isinstance(item, Image.Image):
                    # Direct image
                    content_inputs.append(item)
        else:
            return [], [], [], 0

        # Handle reference images
        if isinstance(reference_images, Image.Image):
            style_inputs = [reference_images] * len(content_inputs)
        elif isinstance(reference_images, list):
            if len(reference_images) == 1:
                style_inputs = reference_images * len(content_inputs)
            elif len(reference_images) == len(content_inputs):
                style_inputs = reference_images
            else:
                # Cycle through reference images if counts don't match
                style_inputs = [reference_images[i % len(reference_images)] for i in range(len(content_inputs))]

        total_samples = len(content_inputs)

    return content_inputs, style_inputs, char_inputs, total_samples


@spaces.GPU()
def run_fontdiffuer_batch(source_images: Union[List[Image.Image], Image.Image, None],
                          reference_image: Image.Image,
                          sampling_step: int = 50,
                          guidance_scale: float = 7.5,
                          batch_size: int = 4,
                          seed: Optional[int] = None) -> List[Image.Image]:
    """
    Run FontDiffuser in batch mode

    The request settings are written onto the module-level ``args`` object
    and generation is delegated to ``batch_sampling`` with the module-level
    ``pipe``.

    Args:
        source_images: Single image, list of images (Gradio Gallery
            ``(image, caption)`` tuples are accepted), or None
        reference_image: Style image applied to every source image
        sampling_step: Number of sampling steps
        guidance_scale: Guidance scale for diffusion
        batch_size: Batch size for processing
        seed: Random seed (if None, a seed in [0, 10000] is drawn)

    Returns:
        List of generated images; an empty list when there is no reference
        image or no usable source image.
    """
    args.adaptive_batch_size = True

    # Without a style reference there is nothing to generate; bail out
    # instead of forwarding a [None] style list to the pipeline.
    if reference_image is None:
        return []

    # No characters are forwarded from this entry point, so a None
    # `source_images` normalizes to zero samples and returns [] below.
    content_inputs, style_inputs, char_inputs, total_samples = _normalize_batch_inputs(
        source_images, None, [reference_image]
    )

    if total_samples == 0:
        return []

    # Set up arguments
    args.character_input = source_images is None
    args.sampling_step = sampling_step
    args.guidance_scale = guidance_scale
    # Don't exceed the available samples, and never drop below one.
    args.batch_size = max(1, min(batch_size, total_samples))
    args.seed = seed if seed is not None else random.randint(0, 10000)

    print(f"Processing {total_samples} samples with batch size {args.batch_size}")

    # Characters are only passed along in character mode; image mode sends None.
    generated_images = batch_sampling(
        args=args,
        pipe=pipe,
        content_inputs=content_inputs,
        style_inputs=style_inputs,
        content_characters=char_inputs if args.character_input else None,
    )

    # Set format for all output images. The None check mirrors the
    # single-image handler; NOTE(review): confirm whether batch_sampling can
    # actually yield None entries.
    for img in generated_images:
        if img is not None:
            img.format = 'PNG'

    return generated_images


if __name__ == '__main__':
    # `args` and `pipe` are deliberately module-level: the handler functions
    # above read them as globals.
    args = arg_parse()
    args.demo = True
    args.ckpt_dir = 'ckpt'
    args.ttf_path = 'ttf/KaiXinSongA.ttf'
    args.device = 'cuda'

    args.max_batch_size = 64
    args.num_workers = 64
    args.adaptive_batch_size = True

    # load fontdiffuer pipeline
    pipe = load_fontdiffuer_pipeline(args=args)

    with gr.Blocks() as demo:
        with gr.Row():
            # Left column: project header and illustrative figures.
            with gr.Column(scale=1):
                gr.HTML("""
                    <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
                    <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
                        FontDiffuser
                    </h1>
                    <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                        <a href="https://yeungchenwa.github.io/">Zhenhua Yang</a>,
                        <a href="https://scholar.google.com/citations?user=6zNgcjAAAAAJ&hl=zh-CN&oi=ao">Dezhi Peng</a>,
                        <a href="https://github.com/kyxscut">Yuxin Kong</a>,
                        <a href="https://github.com/ZZXF11">Yuyi Zhang</a>,
                        <a href="https://scholar.google.com/citations?user=IpmnLFcAAAAJ&hl=zh-CN&oi=ao">Cong Yao</a>,
                        <a href="http://www.dlvc-lab.net/lianwen/Index.html">Lianwen Jin</a>†
                    </h2>
                    <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                        <strong>South China University of Technology</strong>, Alibaba DAMO Academy
                    </h2>
                    <h3 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                    [<a href="https://arxiv.org/abs/2312.12142" style="color:blue;">arXiv</a>]
                    [<a href="https://yeungchenwa.github.io/fontdiffuser-homepage/" style="color:green;">Homepage</a>]
                    [<a href="https://github.com/yeungchenwa/FontDiffuser" style="color:green;">Github</a>]
                    </h3>
                    <h2 style="text-align: left; font-weight: 600; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
                    1.We propose FontDiffuser, which is capable to generate unseen characters and styles, and it can be extended to the cross-lingual generation, such as Chinese to Korean.
                    </h2>
                    <h2 style="text-align: left; font-weight: 600; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
                    2. FontDiffuser excels in generating complex character and handling large style variation. And it achieves state-of-the-art performance.
                    </h2>
                    </div>
                    """)
                gr.Image('figures/result_vis.png')
                gr.Image('figures/demo_tips.png')
            # Right column: single-image inputs, output and sampling controls.
            with gr.Column(scale=1):
                with gr.Row():
                    source_image = gr.Image(width=320, label='[Option 1] Source Image', image_mode='RGB', type='pil')
                    reference_image = gr.Image(width=320, label='Reference Image', image_mode='RGB', type='pil')
                with gr.Row():
                    character = gr.Textbox(value='隆', label='[Option 2] Source Character')
                with gr.Row():
                    fontdiffuer_output_image = gr.Image(height=200, label="FontDiffuser Output Image", image_mode='RGB', type='pil', format='png')

                sampling_step = gr.Slider(20, 50, value=20, step=10,
                                          label="Sampling Step", info="The sampling step by FontDiffuser.")
                guidance_scale = gr.Slider(1, 12, value=7.5, step=0.5,
                                           label="Scale of Classifier-free Guidance",
                                           info="The scale used for classifier-free guidance sampling")
                batch_size = gr.Slider(1, 4, value=1, step=1,
                                       label="Batch Size", info="The number of images to be sampled.")

                run_button = gr.Button('Run FontDiffuser')
                gr.Markdown("## <font color=#008000, size=6>Examples that You Can Choose Below⬇️</font>")
        with gr.Row():
            gr.Markdown("## Examples")
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("## Example 1️⃣: Source Image and Reference Image")
                gr.Markdown("### In this mode, we provide both the source image and \
                            the reference image for you to try our demo!")
                gr.Examples(
                    examples=[['figures/source_imgs/source_灨.jpg', 'figures/ref_imgs/ref_籍.jpg'],
                              ['figures/source_imgs/source_鑻.jpg', 'figures/ref_imgs/ref_鹰.jpg'],
                              ['figures/source_imgs/source_鑫.jpg', 'figures/ref_imgs/ref_壤.jpg'],
                              ['figures/source_imgs/source_釅.jpg', 'figures/ref_imgs/ref_雕.jpg']],
                    inputs=[source_image, reference_image]
                )
            with gr.Column(scale=1):
                gr.Markdown("## Example 2️⃣: Character and Reference Image")
                gr.Markdown("### In this mode, we provide the content character and the reference image \
                            for you to try our demo!")
                gr.Examples(
                    examples=[['龍', 'figures/ref_imgs/ref_鷢.jpg'],
                              ['轉', 'figures/ref_imgs/ref_鲸.jpg'],
                              ['懭', 'figures/ref_imgs/ref_籍_1.jpg'],
                              ['識', 'figures/ref_imgs/ref_鞣.jpg']],
                    inputs=[character, reference_image]
                )
            with gr.Column(scale=1):
                gr.Markdown("## Example 3️⃣: Reference Image")
                gr.Markdown("### In this mode, we provide only the reference image, \
                            you can upload your own source image or you choose the character above \
                            to try our demo!")
                gr.Examples(
                    examples=['figures/ref_imgs/ref_闡.jpg',
                              'figures/ref_imgs/ref_雕.jpg',
                              'figures/ref_imgs/ref_豄.jpg',
                              'figures/ref_imgs/ref_馨.jpg',
                              'figures/ref_imgs/ref_鲸.jpg',
                              'figures/ref_imgs/ref_檀.jpg',
                              'figures/ref_imgs/ref_鞣.jpg',
                              'figures/ref_imgs/ref_穗.jpg',
                              'figures/ref_imgs/ref_欟.jpg',
                              'figures/ref_imgs/ref_籍_1.jpg',
                              'figures/ref_imgs/ref_鷢.jpg',
                              'figures/ref_imgs/ref_媚.jpg',
                              'figures/ref_imgs/ref_籍.jpg',
                              'figures/ref_imgs/ref_壤.jpg',
                              'figures/ref_imgs/ref_蜓.jpg',
                              'figures/ref_imgs/ref_鹰.jpg'],
                    examples_per_page=20,
                    inputs=reference_image
                )
        run_button.click(
            fn=run_fontdiffuer,
            inputs=[source_image,
                    character,
                    reference_image,
                    sampling_step,
                    guidance_scale,
                    batch_size],
            outputs=fontdiffuer_output_image)

        # Batch Mode
        gr.Markdown("## Batch Mode")
        with gr.Row():
            input_images = gr.Gallery(
                format='png',
                file_types=['image'],
                type='pil',
            )

            # Distinct name so the single-image `reference_image` widget above
            # is not rebound.
            batch_reference_image = gr.Image(label='Reference Image', image_mode='RGB', type='pil')

            output_images = gr.Gallery(
                format='png',
                type='pil'
            )

        # Only the images are wired in; the handler's remaining parameters
        # (sampling_step, guidance_scale, batch_size, seed) use their defaults.
        run_batch_button = gr.Button('Run FontDiffuser Batch Mode')
        run_batch_button.click(
            fn=run_fontdiffuer_batch,
            inputs=[input_images, batch_reference_image],
            outputs=output_images
        )

    demo.launch(debug=True)