File size: 5,707 Bytes
a84fd42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
724d9c8
a84fd42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b93eec
a84fd42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import gradio as gr
import numpy as np
import random

import spaces #[uncomment to use ZeroGPU]
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.generation import GenerationConfig


# Inference settings. `device` is the single place that decides where the
# model (and, in generate_image, its inputs) live.
device = "cuda"
torch_dtype = torch.bfloat16
model_name_or_path = "X-Omni/X-Omni-Zh"
flux_model_name_or_path = "zhangxiaosong18/FLUX.1-dev-VAE"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
# trust_remote_code=True allows transformers to load model code shipped with
# the checkpoint; init_vision / set_generation_mode / mmdecode are not
# standard transformers methods, so they presumably come from that code.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch_dtype,
    trust_remote_code=True,
).to(device)  # honour the `device` setting rather than a hard-coded .cuda()
# The vision component is initialised from the second checkpoint id, then the
# model is switched to image generation and put in eval mode.
model.init_vision(flux_model_name_or_path)
model.set_generation_mode('image')
model.eval()

@spaces.GPU(duration=199)  # ZeroGPU: run this call on a GPU worker (duration=199)
def generate_image(
    image_prompt,
    image_size,
    top_p,
    min_p,
    seed,
):
    """Generate one image from a text prompt using the module-level model.

    Args:
        image_prompt (str): Text description of the image; surrounding
            whitespace is stripped before tokenization.
        image_size (str): Two integers joined by ``x`` (e.g. ``"1152x768"``).
            The first number drives the token-grid height and the second its
            width. NOTE(review): confirm the UI strings are meant as
            height-by-width rather than width-by-height.
        top_p (float): Forwarded to ``GenerationConfig(top_p=...)``.
        min_p (float): Forwarded to ``GenerationConfig(min_p=...)``.
        seed (int | float | None): Seed passed to ``torch.manual_seed``.
            Floats are truncated to int; ``None`` (e.g. a cleared Number
            field) falls back to 0 instead of raising ``TypeError``.

    Returns:
        The first image returned by ``model.mmdecode``. Presumably a PIL
        image, since the output component is ``gr.Image(type='pil')`` --
        confirm against the model code.

    Raises:
        ValueError: If ``image_size`` is not exactly two ``x``-separated
            integers.
    """
    image_prompt = image_prompt.strip()

    try:
        height, width = (int(part) for part in image_size.split('x'))
    except ValueError as err:
        # Covers both non-numeric parts and a wrong number of parts.
        raise ValueError(
            f"image_size must look like '<int>x<int>', got {image_size!r}"
        ) from err

    # torch.manual_seed only accepts an int.
    seed = 0 if seed is None else int(seed)

    # Each dimension is divided by 16 to obtain the token grid (presumably one
    # token per 16x16 pixel patch -- confirm against the model code).
    token_h, token_w = height // 16, width // 16
    image_prefix = f'<SOM>{token_h} {token_w}<IMAGE>'
    generation_config = GenerationConfig(
        # One new token per grid cell.
        max_new_tokens=token_h * token_w,
        do_sample=True,
        temperature=1.0,
        min_p=min_p,
        top_p=top_p,
        guidance_scale=1.0,
        # Prevent the multimodal special tokens from being sampled.
        suppress_tokens=tokenizer.convert_tokens_to_ids(model.config.mm_special_tokens),
    )

    encoded = tokenizer(
        [image_prompt + image_prefix],
        return_tensors='pt',
        padding='longest',
        padding_side='left',
    )
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)

    torch.manual_seed(seed)
    generated = model.generate(
        inputs=input_ids,
        attention_mask=attention_mask,
        generation_config=generation_config,
    )
    # Re-seed so decoding starts from the same RNG state no matter how many
    # random draws the sampling step consumed.
    torch.manual_seed(seed)
    _, images = model.mmdecode(tokenizer, generated[0], skip_special_tokens=False)

    return images[0]


# Showcase prompts (Chinese) offered in the examples gallery.
_EXAMPLE_PROMPTS = (
    '''
生成一张雪中的紫禁城全景封面图,作为北京冬季旅游指南的主题。画面以近景构图展现建筑,红墙金瓦被皑皑白雪覆盖,朱红色宫墙,金黄色瓦片与洁白雪色形成强烈对比,琉璃瓦顶的积雪在阳光下折射出晶莹光泽。前景一枝腊梅花正在盛开,背景为灰蓝色冬日天空,飘落细雪,远处角楼轮廓若隐若现,增添朦胧诗意感。图片上有标题“雪落北平·穿越600年”,另有副标题“北京古建筑雪景深度游”。文字艺术感极强,与图片良好融合起来
    '''.strip(),
    '''
画面的中心摆放着一个复古花瓶,瓶身主体为浓郁的蓝色,这种蓝色深邃而典雅,仿佛带着岁月的沉淀。花瓶设计极具复古风格,瓶颈处环绕着细致的金色雕花,宛如华丽的项链点缀其上;瓶身绘制着精美的花卉图案,笔触细腻,色彩过渡自然,展现出极高的工艺水准,整体彰显出优雅的古典韵味。花瓶放置在深色木质的圆桌上,旁边搭配了一束新鲜绽放的百合花,为画面增添了几分生机与活力。背景是一幅淡蓝色的壁纸,上面有着若隐若现的花纹,营造出一种静谧而温馨的氛围。图片中的文字信息十分醒目。“家居美学盛典”位于顶部中央,字体较大,在视觉上十分突出,吸引观众的目光;左下角写着“下单直降 100”,下方紧跟数字“399”,强调了价格优惠;右下角有“限量抢购 速来咨询”的提示,引导观众进一步咨询;最底部中央,“前 50 名买一送一”的字样突出促销活动的紧迫性和吸引力。这些文字信息通过巧妙的颜色、大小和背景设计,在空间布局上层次分明,重点突出,有效地引导观众关注促销信息and价格优势。
    '''.strip(),
)

# Settings shared by every example row, in the order the UI inputs expect:
# image size, top_p, min_p, seed.
_EXAMPLE_SETTINGS = ['1152x1152', 1.0, 0.03, 0]

# One row per prompt: [prompt, image size, top_p, min_p, seed].
examples = [[text, *_EXAMPLE_SETTINGS] for text in _EXAMPLE_PROMPTS]


# Extra CSS passed to gr.Blocks: caps the `.app` element at 800px wide and
# centres it horizontally.
css = """
.app {
    max-width: 800px !important;
    margin: 0 auto !important;
}
"""

with gr.Blocks(css=css) as demo:
    # Page header: title, model link and project links.
    gr.HTML('''
<h1 style="text-align:center">🎨X-Omni: Reinforcement Learning Makes Discrete Autoregressive Image Generative Models Great Again</h1>
<h3 style="text-align:center">Model: <a href="https://huggingface.co/X-Omni/X-Omni-Zh">X-Omni-Zh</a> (support Chinese text rendering)</h3>
<p align="center">
  <a href="https://x-omni-team.github.io">🏠 Project Page</a> |
  <a href="https://arxiv.org/pdf/2507.22058">📄 Paper</a> |
  <a href="https://github.com/X-Omni-Team/X-Omni">💻​ Code</a> |
  <a href="https://huggingface.co/collections/X-Omni/x-omni-models-6888aadcc54baad7997d7982">🤗 HuggingFace Model</a>
</p>
    '''.strip())

    # Prompt box, followed by the generated image.
    with gr.Row():
        textbox = gr.Textbox(
            lines=2,
            placeholder='text prompt for image generation',
            show_label=False,
        )
    image = gr.Image(show_label=False, type='pil')

    # Generate button next to a collapsed panel of sampling controls.
    with gr.Row():
        button = gr.Button("Generate", variant="primary")
        with gr.Accordion("Advanced Settings", open=False):
            image_size = gr.Dropdown(
                label="Image Size",
                choices=["1152x1152", "1152x768", "768x1152"],
                value="1152x1152",
            )
            top_p = gr.Slider(
                label="Top P",
                minimum=0.0,
                maximum=1.0,
                value=1.0,
                step=0.01,
            )
            min_p = gr.Slider(
                label="Min P",
                minimum=0.0,
                maximum=1.0,
                value=0.03,
                step=0.01,
            )
            seed_input = gr.Number(label="Seed", value=0, precision=0)

    # The example gallery and the button feed generate_image from the same
    # controls, in the order of its parameters.
    generation_inputs = (textbox, image_size, top_p, min_p, seed_input)

    with gr.Row():
        gr.Examples(
            examples=examples,
            inputs=generation_inputs,
            outputs=image,
            fn=generate_image,
            cache_examples=False,
            run_on_click=True,
        )
    button.click(
        generate_image,
        inputs=generation_inputs,
        outputs=image,
    )

if __name__ == "__main__":
    # Launch the Gradio app only when this file is run as a script;
    # ssr_mode=False is passed to turn off Gradio's server-side rendering.
    demo.launch(ssr_mode=False)