Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -5,6 +5,9 @@ import os
|
|
5 |
import torch
|
6 |
import random
|
7 |
import subprocess
|
|
|
|
|
|
|
8 |
subprocess.run(
|
9 |
"pip install flash-attn --no-build-isolation",
|
10 |
env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
|
@@ -26,6 +29,9 @@ from modeling.qwen2 import Qwen2Tokenizer
|
|
26 |
|
27 |
from huggingface_hub import snapshot_download
|
28 |
|
|
|
|
|
|
|
29 |
save_dir = "./model_weights"
|
30 |
repo_id = "ByteDance-Seed/BAGEL-7B-MoT"
|
31 |
cache_dir = save_dir + "/cache"
|
@@ -128,6 +134,58 @@ inferencer = InterleaveInferencer(
|
|
128 |
new_token_ids=new_token_ids,
|
129 |
)
|
130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
def set_seed(seed):
|
132 |
"""Set random seeds for reproducibility"""
|
133 |
if seed > 0:
|
@@ -143,13 +201,16 @@ def set_seed(seed):
|
|
143 |
|
144 |
# Text to Image function with thinking option and hyperparameters
|
145 |
@spaces.GPU(duration=90)
|
146 |
-
def text_to_image(prompt, show_thinking=False, cfg_text_scale=4.0, cfg_interval=0.4,
|
147 |
timestep_shift=3.0, num_timesteps=50,
|
148 |
cfg_renorm_min=1.0, cfg_renorm_type="global",
|
149 |
max_think_token_n=1024, do_sample=False, text_temperature=0.3,
|
150 |
seed=0, image_ratio="1:1"):
|
151 |
# Set seed for reproducibility
|
152 |
set_seed(seed)
|
|
|
|
|
|
|
153 |
|
154 |
if image_ratio == "1:1":
|
155 |
image_shapes = (1024, 1024)
|
@@ -178,7 +239,7 @@ def text_to_image(prompt, show_thinking=False, cfg_text_scale=4.0, cfg_interval=
|
|
178 |
|
179 |
result = {"text": "", "image": None}
|
180 |
# Call inferencer with or without think parameter based on user choice
|
181 |
-
for i in inferencer(text=
|
182 |
if type(i) == str:
|
183 |
result["text"] += i
|
184 |
else:
|
@@ -189,7 +250,7 @@ def text_to_image(prompt, show_thinking=False, cfg_text_scale=4.0, cfg_interval=
|
|
189 |
|
190 |
# Image Understanding function with thinking option and hyperparameters
|
191 |
@spaces.GPU(duration=90)
|
192 |
-
def image_understanding(image: Image.Image, prompt: str, show_thinking=False,
|
193 |
do_sample=False, text_temperature=0.3, max_new_tokens=512):
|
194 |
if image is None:
|
195 |
return "Please upload an image."
|
@@ -199,6 +260,9 @@ def image_understanding(image: Image.Image, prompt: str, show_thinking=False,
|
|
199 |
|
200 |
image = pil_img2rgb(image)
|
201 |
|
|
|
|
|
|
|
202 |
# Set hyperparameters
|
203 |
inference_hyper = dict(
|
204 |
do_sample=do_sample,
|
@@ -208,7 +272,7 @@ def image_understanding(image: Image.Image, prompt: str, show_thinking=False,
|
|
208 |
|
209 |
result = {"text": "", "image": None}
|
210 |
# Use show_thinking parameter to control thinking process
|
211 |
-
for i in inferencer(image=image, text=
|
212 |
understanding_output=True, **inference_hyper):
|
213 |
if type(i) == str:
|
214 |
result["text"] += i
|
@@ -219,7 +283,7 @@ def image_understanding(image: Image.Image, prompt: str, show_thinking=False,
|
|
219 |
|
220 |
# Image Editing function with thinking option and hyperparameters
|
221 |
@spaces.GPU(duration=90)
|
222 |
-
def edit_image(image: Image.Image, prompt: str, show_thinking=False, cfg_text_scale=4.0,
|
223 |
cfg_img_scale=2.0, cfg_interval=0.0,
|
224 |
timestep_shift=3.0, num_timesteps=50, cfg_renorm_min=1.0,
|
225 |
cfg_renorm_type="text_channel", max_think_token_n=1024,
|
@@ -235,6 +299,9 @@ def edit_image(image: Image.Image, prompt: str, show_thinking=False, cfg_text_sc
|
|
235 |
|
236 |
image = pil_img2rgb(image)
|
237 |
|
|
|
|
|
|
|
238 |
# Set hyperparameters
|
239 |
inference_hyper = dict(
|
240 |
max_think_token_n=max_think_token_n if show_thinking else 1024,
|
@@ -251,7 +318,7 @@ def edit_image(image: Image.Image, prompt: str, show_thinking=False, cfg_text_sc
|
|
251 |
|
252 |
# Include thinking parameter based on user choice
|
253 |
result = {"text": "", "image": None}
|
254 |
-
for i in inferencer(image=image, text=
|
255 |
if type(i) == str:
|
256 |
result["text"] += i
|
257 |
else:
|
@@ -267,22 +334,257 @@ def load_example_image(image_path):
|
|
267 |
print(f"Error loading example image: {e}")
|
268 |
return None
|
269 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
270 |
|
271 |
# Gradio UI
|
272 |
-
with gr.Blocks() as demo:
|
273 |
-
gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
274 |
|
275 |
with gr.Tab("π Text to Image"):
|
276 |
txt_input = gr.Textbox(
|
277 |
label="Prompt",
|
278 |
-
value="A female cosplayer portraying an ethereal fairy or elf, wearing a flowing dress made of delicate fabrics in soft, mystical colors like emerald green and silver. She has pointed ears, a gentle, enchanting expression, and her outfit is adorned with sparkling jewels and intricate patterns. The background is a magical forest with glowing plants, mystical creatures, and a serene atmosphere."
|
|
|
279 |
)
|
280 |
|
281 |
with gr.Row():
|
282 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
|
284 |
# Add hyperparameter controls in an accordion
|
285 |
-
with gr.Accordion("
|
286 |
# Lay out the parameters two per row
|
287 |
with gr.Group():
|
288 |
with gr.Row():
|
@@ -322,8 +624,8 @@ with gr.Blocks() as demo:
|
|
322 |
label="Temperature", info="Controls randomness in text generation")
|
323 |
|
324 |
thinking_output = gr.Textbox(label="Thinking Process", visible=False)
|
325 |
-
img_output = gr.Image(label="Generated Image")
|
326 |
-
gen_btn = gr.Button("Generate", variant="primary")
|
327 |
|
328 |
# Dynamically show/hide thinking process box and parameters
|
329 |
def update_thinking_visibility(show):
|
@@ -339,7 +641,7 @@ with gr.Blocks() as demo:
|
|
339 |
triggers=[gen_btn.click, txt_input.submit],
|
340 |
fn=text_to_image,
|
341 |
inputs=[
|
342 |
-
txt_input, show_thinking, cfg_text_scale,
|
343 |
cfg_interval, timestep_shift,
|
344 |
num_timesteps, cfg_renorm_min, cfg_renorm_type,
|
345 |
max_think_token_n, do_sample, text_temperature, seed, image_ratio
|
@@ -350,21 +652,27 @@ with gr.Blocks() as demo:
|
|
350 |
with gr.Tab("ποΈ Image Edit"):
|
351 |
with gr.Row():
|
352 |
with gr.Column(scale=1):
|
353 |
-
edit_image_input = gr.Image(label="Input Image", value=load_example_image('test_images/women.jpg'))
|
354 |
edit_prompt = gr.Textbox(
|
355 |
-
label="Prompt",
|
356 |
-
value="She boards a modern subway, quietly reading a folded newspaper, wearing the same clothes."
|
|
|
357 |
)
|
358 |
|
359 |
with gr.Column(scale=1):
|
360 |
-
edit_image_output = gr.Image(label="Result")
|
361 |
edit_thinking_output = gr.Textbox(label="Thinking Process", visible=False)
|
362 |
|
363 |
with gr.Row():
|
364 |
-
|
|
|
|
|
|
|
|
|
|
|
365 |
|
366 |
# Add hyperparameter controls in an accordion
|
367 |
-
with gr.Accordion("
|
368 |
with gr.Group():
|
369 |
with gr.Row():
|
370 |
edit_seed = gr.Slider(minimum=0, maximum=1000000, value=0, step=1, interactive=True,
|
@@ -402,7 +710,7 @@ with gr.Blocks() as demo:
|
|
402 |
edit_text_temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, interactive=True,
|
403 |
label="Temperature", info="Controls randomness in text generation")
|
404 |
|
405 |
-
edit_btn = gr.Button("
|
406 |
|
407 |
# Dynamically show/hide thinking process box for editing
|
408 |
def update_edit_thinking_visibility(show):
|
@@ -418,7 +726,7 @@ with gr.Blocks() as demo:
|
|
418 |
triggers=[edit_btn.click, edit_prompt.submit],
|
419 |
fn=edit_image,
|
420 |
inputs=[
|
421 |
-
edit_image_input, edit_prompt, edit_show_thinking,
|
422 |
edit_cfg_text_scale, edit_cfg_img_scale, edit_cfg_interval,
|
423 |
edit_timestep_shift, edit_num_timesteps,
|
424 |
edit_cfg_renorm_min, edit_cfg_renorm_type,
|
@@ -430,20 +738,26 @@ with gr.Blocks() as demo:
|
|
430 |
with gr.Tab("πΌοΈ Image Understanding"):
|
431 |
with gr.Row():
|
432 |
with gr.Column(scale=1):
|
433 |
-
img_input = gr.Image(label="Input Image", value=load_example_image('test_images/meme.jpg'))
|
434 |
understand_prompt = gr.Textbox(
|
435 |
-
label="
|
436 |
-
value="Can someone explain what's funny about this meme??"
|
|
|
437 |
)
|
438 |
|
439 |
with gr.Column(scale=1):
|
440 |
-
txt_output = gr.Textbox(label="
|
441 |
|
442 |
with gr.Row():
|
443 |
-
|
|
|
|
|
|
|
|
|
|
|
444 |
|
445 |
# Add hyperparameter controls in an accordion
|
446 |
-
with gr.Accordion("
|
447 |
with gr.Row():
|
448 |
understand_do_sample = gr.Checkbox(label="Sampling", value=False, info="Enable sampling for text generation")
|
449 |
understand_text_temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.05, interactive=True,
|
@@ -451,20 +765,32 @@ with gr.Blocks() as demo:
|
|
451 |
understand_max_new_tokens = gr.Slider(minimum=64, maximum=4096, value=512, step=64, interactive=True,
|
452 |
label="Max New Tokens", info="Maximum length of generated text, including potential thinking")
|
453 |
|
454 |
-
img_understand_btn = gr.Button("
|
455 |
|
456 |
gr.on(
|
457 |
triggers=[img_understand_btn.click, understand_prompt.submit],
|
458 |
fn=image_understanding,
|
459 |
inputs=[
|
460 |
-
img_input, understand_prompt, understand_show_thinking,
|
461 |
understand_do_sample, understand_text_temperature, understand_max_new_tokens
|
462 |
],
|
463 |
outputs=txt_output
|
464 |
)
|
465 |
|
466 |
-
gr.
|
467 |
-
"
|
468 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
469 |
|
470 |
demo.launch(share=True)
|
|
|
5 |
import torch
|
6 |
import random
|
7 |
import subprocess
|
8 |
+
import requests
|
9 |
+
import json
|
10 |
+
|
11 |
subprocess.run(
|
12 |
"pip install flash-attn --no-build-isolation",
|
13 |
env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
|
|
|
29 |
|
30 |
from huggingface_hub import snapshot_download
|
31 |
|
32 |
+
# Get Brave Search API key
|
33 |
+
BSEARCH_API = os.getenv("BSEARCH_API")
|
34 |
+
|
35 |
save_dir = "./model_weights"
|
36 |
repo_id = "ByteDance-Seed/BAGEL-7B-MoT"
|
37 |
cache_dir = save_dir + "/cache"
|
|
|
134 |
new_token_ids=new_token_ids,
|
135 |
)
|
136 |
|
137 |
+
# Brave Search function
|
138 |
+
def brave_search(query, count=5, timeout=10):
    """Perform a web search using the Brave Search API.

    Args:
        query: Search text, sent as the ``q`` request parameter.
        count: Maximum number of results to request and to format.
            Defaults to 5, the value that used to be hard-coded.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A string with one numbered entry per result (title, URL and
        description; entries separated by a blank line), or ``None`` when
        no API key is configured, the query is blank, the request fails,
        or the response contains no web results.
    """
    # Nothing to do without a key, and a blank query is not worth a round trip.
    if not BSEARCH_API or not query or not str(query).strip():
        return None

    endpoint = "https://api.search.brave.com/res/v1/web/search"
    headers = {
        "Accept": "application/json",
        "X-Subscription-Token": BSEARCH_API,
    }
    params = {
        "q": query,
        "count": count,
    }

    try:
        # requests has no default timeout; without one a stalled connection
        # would block the calling handler indefinitely.
        response = requests.get(
            endpoint, headers=headers, params=params, timeout=timeout
        )
        response.raise_for_status()

        data = response.json()

        results = []
        if "web" in data and "results" in data["web"]:
            for idx, result in enumerate(data["web"]["results"][:count], 1):
                title = result.get("title", "No title")
                # Separate name so the endpoint variable is not shadowed.
                result_url = result.get("url", "")
                description = result.get("description", "No description")
                results.append(f"{idx}. {title}\nURL: {result_url}\n{description}")

        if results:
            return "\n\n".join(results)
        return None

    except Exception as e:
        # Deliberately best-effort: search is an optional enhancement, so any
        # failure (network, HTTP status, malformed payload) is reported and
        # the caller falls back to the unmodified prompt.
        print(f"Search error: {str(e)}")
        return None
|
176 |
+
|
177 |
+
def enhance_prompt_with_search(prompt, use_search=False):
    """Enhance a prompt with web search results if enabled.

    Args:
        prompt: The user's original prompt text.
        use_search: When false, the prompt is returned untouched.

    Returns:
        The original prompt followed by a ``[Web Search Context]`` section
        when search is enabled, an API key is configured, the prompt is not
        blank and the search returned results; otherwise the original
        prompt unchanged.
    """
    # Guard clauses, cheapest first: search disabled, or nothing to search for.
    if not use_search or not prompt or not str(prompt).strip():
        return prompt
    if not BSEARCH_API:
        return prompt

    search_results = brave_search(prompt)
    if not search_results:
        # Search failed or found nothing; fall back to the plain prompt.
        return prompt

    return f"{prompt}\n\n[Web Search Context]:\n{search_results}\n\n[Generate based on the above context and original prompt]"
|
188 |
+
|
189 |
def set_seed(seed):
|
190 |
"""Set random seeds for reproducibility"""
|
191 |
if seed > 0:
|
|
|
201 |
|
202 |
# Text to Image function with thinking option and hyperparameters
|
203 |
@spaces.GPU(duration=90)
|
204 |
+
def text_to_image(prompt, use_web_search=False, show_thinking=False, cfg_text_scale=4.0, cfg_interval=0.4,
|
205 |
timestep_shift=3.0, num_timesteps=50,
|
206 |
cfg_renorm_min=1.0, cfg_renorm_type="global",
|
207 |
max_think_token_n=1024, do_sample=False, text_temperature=0.3,
|
208 |
seed=0, image_ratio="1:1"):
|
209 |
# Set seed for reproducibility
|
210 |
set_seed(seed)
|
211 |
+
|
212 |
+
# Enhance prompt with search if enabled
|
213 |
+
enhanced_prompt = enhance_prompt_with_search(prompt, use_web_search)
|
214 |
|
215 |
if image_ratio == "1:1":
|
216 |
image_shapes = (1024, 1024)
|
|
|
239 |
|
240 |
result = {"text": "", "image": None}
|
241 |
# Call inferencer with or without think parameter based on user choice
|
242 |
+
for i in inferencer(text=enhanced_prompt, think=show_thinking, understanding_output=False, **inference_hyper):
|
243 |
if type(i) == str:
|
244 |
result["text"] += i
|
245 |
else:
|
|
|
250 |
|
251 |
# Image Understanding function with thinking option and hyperparameters
|
252 |
@spaces.GPU(duration=90)
|
253 |
+
def image_understanding(image: Image.Image, prompt: str, use_web_search=False, show_thinking=False,
|
254 |
do_sample=False, text_temperature=0.3, max_new_tokens=512):
|
255 |
if image is None:
|
256 |
return "Please upload an image."
|
|
|
260 |
|
261 |
image = pil_img2rgb(image)
|
262 |
|
263 |
+
# Enhance prompt with search if enabled
|
264 |
+
enhanced_prompt = enhance_prompt_with_search(prompt, use_web_search)
|
265 |
+
|
266 |
# Set hyperparameters
|
267 |
inference_hyper = dict(
|
268 |
do_sample=do_sample,
|
|
|
272 |
|
273 |
result = {"text": "", "image": None}
|
274 |
# Use show_thinking parameter to control thinking process
|
275 |
+
for i in inferencer(image=image, text=enhanced_prompt, think=show_thinking,
|
276 |
understanding_output=True, **inference_hyper):
|
277 |
if type(i) == str:
|
278 |
result["text"] += i
|
|
|
283 |
|
284 |
# Image Editing function with thinking option and hyperparameters
|
285 |
@spaces.GPU(duration=90)
|
286 |
+
def edit_image(image: Image.Image, prompt: str, use_web_search=False, show_thinking=False, cfg_text_scale=4.0,
|
287 |
cfg_img_scale=2.0, cfg_interval=0.0,
|
288 |
timestep_shift=3.0, num_timesteps=50, cfg_renorm_min=1.0,
|
289 |
cfg_renorm_type="text_channel", max_think_token_n=1024,
|
|
|
299 |
|
300 |
image = pil_img2rgb(image)
|
301 |
|
302 |
+
# Enhance prompt with search if enabled
|
303 |
+
enhanced_prompt = enhance_prompt_with_search(prompt, use_web_search)
|
304 |
+
|
305 |
# Set hyperparameters
|
306 |
inference_hyper = dict(
|
307 |
max_think_token_n=max_think_token_n if show_thinking else 1024,
|
|
|
318 |
|
319 |
# Include thinking parameter based on user choice
|
320 |
result = {"text": "", "image": None}
|
321 |
+
for i in inferencer(image=image, text=enhanced_prompt, think=show_thinking, understanding_output=False, **inference_hyper):
|
322 |
if type(i) == str:
|
323 |
result["text"] += i
|
324 |
else:
|
|
|
334 |
print(f"Error loading example image: {e}")
|
335 |
return None
|
336 |
|
337 |
+
# Enhanced CSS for visual improvements
|
338 |
+
custom_css = """
|
339 |
+
/* Modern gradient background */
|
340 |
+
.gradio-container {
|
341 |
+
background: linear-gradient(135deg, #1e3c72 0%, #2a5298 50%, #3a6fb0 100%);
|
342 |
+
min-height: 100vh;
|
343 |
+
}
|
344 |
+
|
345 |
+
/* Main container with glassmorphism */
|
346 |
+
.container {
|
347 |
+
backdrop-filter: blur(10px);
|
348 |
+
background: rgba(255, 255, 255, 0.1);
|
349 |
+
border-radius: 20px;
|
350 |
+
padding: 30px;
|
351 |
+
margin: 20px auto;
|
352 |
+
max-width: 1400px;
|
353 |
+
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.2);
|
354 |
+
}
|
355 |
+
|
356 |
+
/* Header styling */
|
357 |
+
h1 {
|
358 |
+
background: linear-gradient(90deg, #ffffff 0%, #e0e0e0 100%);
|
359 |
+
-webkit-background-clip: text;
|
360 |
+
-webkit-text-fill-color: transparent;
|
361 |
+
font-size: 3.5em;
|
362 |
+
text-align: center;
|
363 |
+
margin-bottom: 30px;
|
364 |
+
font-weight: 800;
|
365 |
+
text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.3);
|
366 |
+
}
|
367 |
+
|
368 |
+
/* Tab styling */
|
369 |
+
.tabs {
|
370 |
+
background: rgba(255, 255, 255, 0.15);
|
371 |
+
border-radius: 15px;
|
372 |
+
padding: 10px;
|
373 |
+
margin-bottom: 20px;
|
374 |
+
}
|
375 |
+
|
376 |
+
.tab-nav {
|
377 |
+
background: rgba(255, 255, 255, 0.2) !important;
|
378 |
+
border-radius: 10px !important;
|
379 |
+
padding: 5px !important;
|
380 |
+
}
|
381 |
+
|
382 |
+
.tab-nav button {
|
383 |
+
background: transparent !important;
|
384 |
+
color: white !important;
|
385 |
+
border: none !important;
|
386 |
+
padding: 10px 20px !important;
|
387 |
+
margin: 0 5px !important;
|
388 |
+
border-radius: 8px !important;
|
389 |
+
font-weight: 600 !important;
|
390 |
+
transition: all 0.3s ease !important;
|
391 |
+
}
|
392 |
+
|
393 |
+
.tab-nav button.selected {
|
394 |
+
background: rgba(255, 255, 255, 0.3) !important;
|
395 |
+
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2) !important;
|
396 |
+
}
|
397 |
+
|
398 |
+
.tab-nav button:hover {
|
399 |
+
background: rgba(255, 255, 255, 0.25) !important;
|
400 |
+
}
|
401 |
+
|
402 |
+
/* Input field styling */
|
403 |
+
.textbox, .image-container {
|
404 |
+
background: rgba(255, 255, 255, 0.95) !important;
|
405 |
+
border: 2px solid rgba(255, 255, 255, 0.3) !important;
|
406 |
+
border-radius: 12px !important;
|
407 |
+
padding: 15px !important;
|
408 |
+
color: #333 !important;
|
409 |
+
font-size: 16px !important;
|
410 |
+
transition: all 0.3s ease !important;
|
411 |
+
}
|
412 |
+
|
413 |
+
.textbox:focus {
|
414 |
+
border-color: #3a6fb0 !important;
|
415 |
+
box-shadow: 0 0 20px rgba(58, 111, 176, 0.4) !important;
|
416 |
+
}
|
417 |
+
|
418 |
+
/* Button styling */
|
419 |
+
.primary {
|
420 |
+
background: linear-gradient(135deg, #4CAF50 0%, #45a049 100%) !important;
|
421 |
+
color: white !important;
|
422 |
+
border: none !important;
|
423 |
+
padding: 12px 30px !important;
|
424 |
+
border-radius: 10px !important;
|
425 |
+
font-weight: 600 !important;
|
426 |
+
font-size: 16px !important;
|
427 |
+
cursor: pointer !important;
|
428 |
+
transition: all 0.3s ease !important;
|
429 |
+
box-shadow: 0 4px 15px rgba(76, 175, 80, 0.3) !important;
|
430 |
+
}
|
431 |
+
|
432 |
+
.primary:hover {
|
433 |
+
transform: translateY(-2px) !important;
|
434 |
+
box-shadow: 0 6px 20px rgba(76, 175, 80, 0.4) !important;
|
435 |
+
}
|
436 |
+
|
437 |
+
/* Checkbox styling */
|
438 |
+
.checkbox-group {
|
439 |
+
background: rgba(255, 255, 255, 0.1) !important;
|
440 |
+
padding: 10px 15px !important;
|
441 |
+
border-radius: 8px !important;
|
442 |
+
margin: 10px 0 !important;
|
443 |
+
}
|
444 |
+
|
445 |
+
.checkbox-group label {
|
446 |
+
color: white !important;
|
447 |
+
font-weight: 500 !important;
|
448 |
+
}
|
449 |
+
|
450 |
+
/* Accordion styling */
|
451 |
+
.accordion {
|
452 |
+
background: rgba(255, 255, 255, 0.1) !important;
|
453 |
+
border-radius: 12px !important;
|
454 |
+
margin: 15px 0 !important;
|
455 |
+
border: 1px solid rgba(255, 255, 255, 0.2) !important;
|
456 |
+
}
|
457 |
+
|
458 |
+
.accordion-header {
|
459 |
+
background: rgba(255, 255, 255, 0.15) !important;
|
460 |
+
color: white !important;
|
461 |
+
padding: 12px 20px !important;
|
462 |
+
border-radius: 10px !important;
|
463 |
+
font-weight: 600 !important;
|
464 |
+
}
|
465 |
+
|
466 |
+
/* Slider styling */
|
467 |
+
.slider {
|
468 |
+
background: rgba(255, 255, 255, 0.2) !important;
|
469 |
+
border-radius: 5px !important;
|
470 |
+
}
|
471 |
+
|
472 |
+
.slider .handle {
|
473 |
+
background: white !important;
|
474 |
+
border: 3px solid #3a6fb0 !important;
|
475 |
+
}
|
476 |
+
|
477 |
+
/* Image output styling */
|
478 |
+
.image-frame {
|
479 |
+
border-radius: 15px !important;
|
480 |
+
overflow: hidden !important;
|
481 |
+
box-shadow: 0 8px 25px rgba(0, 0, 0, 0.3) !important;
|
482 |
+
background: rgba(255, 255, 255, 0.1) !important;
|
483 |
+
padding: 10px !important;
|
484 |
+
}
|
485 |
+
|
486 |
+
/* Footer links */
|
487 |
+
a {
|
488 |
+
color: #64b5f6 !important;
|
489 |
+
text-decoration: none !important;
|
490 |
+
font-weight: 500 !important;
|
491 |
+
transition: color 0.3s ease !important;
|
492 |
+
}
|
493 |
+
|
494 |
+
a:hover {
|
495 |
+
color: #90caf9 !important;
|
496 |
+
}
|
497 |
+
|
498 |
+
/* Web search info box */
|
499 |
+
.web-search-info {
|
500 |
+
background: linear-gradient(135deg, rgba(255, 193, 7, 0.2) 0%, rgba(255, 152, 0, 0.2) 100%);
|
501 |
+
border: 2px solid rgba(255, 193, 7, 0.5);
|
502 |
+
border-radius: 10px;
|
503 |
+
padding: 15px;
|
504 |
+
margin: 10px 0;
|
505 |
+
color: white;
|
506 |
+
}
|
507 |
+
|
508 |
+
.web-search-info h4 {
|
509 |
+
margin: 0 0 10px 0;
|
510 |
+
color: #ffd54f;
|
511 |
+
font-size: 1.2em;
|
512 |
+
}
|
513 |
+
|
514 |
+
.web-search-info p {
|
515 |
+
margin: 5px 0;
|
516 |
+
font-size: 0.95em;
|
517 |
+
line-height: 1.4;
|
518 |
+
}
|
519 |
+
|
520 |
+
/* Loading animation */
|
521 |
+
.generating {
|
522 |
+
border-color: #4CAF50 !important;
|
523 |
+
animation: pulse 2s infinite !important;
|
524 |
+
}
|
525 |
+
|
526 |
+
@keyframes pulse {
|
527 |
+
0% {
|
528 |
+
box-shadow: 0 0 0 0 rgba(76, 175, 80, 0.7);
|
529 |
+
}
|
530 |
+
70% {
|
531 |
+
box-shadow: 0 0 0 10px rgba(76, 175, 80, 0);
|
532 |
+
}
|
533 |
+
100% {
|
534 |
+
box-shadow: 0 0 0 0 rgba(76, 175, 80, 0);
|
535 |
+
}
|
536 |
+
}
|
537 |
+
"""
|
538 |
|
539 |
# Gradio UI
|
540 |
+
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
|
541 |
+
gr.HTML("""
|
542 |
+
<div class="container">
|
543 |
+
<h1>π₯― BAGEL - Bootstrapping Aligned Generation with Exponential Learning</h1>
|
544 |
+
<p style="text-align: center; color: #e0e0e0; font-size: 1.2em; margin-bottom: 30px;">
|
545 |
+
Advanced AI Model for Text-to-Image, Image Editing, and Image Understanding
|
546 |
+
</p>
|
547 |
+
</div>
|
548 |
+
""")
|
549 |
|
550 |
with gr.Tab("π Text to Image"):
|
551 |
txt_input = gr.Textbox(
|
552 |
label="Prompt",
|
553 |
+
value="A female cosplayer portraying an ethereal fairy or elf, wearing a flowing dress made of delicate fabrics in soft, mystical colors like emerald green and silver. She has pointed ears, a gentle, enchanting expression, and her outfit is adorned with sparkling jewels and intricate patterns. The background is a magical forest with glowing plants, mystical creatures, and a serene atmosphere.",
|
554 |
+
lines=3
|
555 |
)
|
556 |
|
557 |
with gr.Row():
|
558 |
+
use_web_search = gr.Checkbox(
|
559 |
+
label="π Enable Web Search",
|
560 |
+
value=False,
|
561 |
+
info="Search the web for current information to enhance your prompt"
|
562 |
+
)
|
563 |
+
show_thinking = gr.Checkbox(label="π Show Thinking Process", value=False)
|
564 |
+
|
565 |
+
# Web Search Information Box
|
566 |
+
# Explanatory box for the web-search option. The component starts hidden via
# visible=False and is revealed by the checkbox handler below. The inner <div>
# must not carry its own inline "display: none": that style would keep the
# box invisible even after the component is switched to visible.
web_search_info = gr.HTML("""
    <div class="web-search-info">
        <h4>π Brave Web Search Integration</h4>
        <p>When enabled, BAGEL will search the web for relevant information about your prompt and incorporate current trends, references, and context into the image generation process.</p>
        <p>This is particularly useful for:</p>
        <ul style="margin-left: 20px;">
            <li>β’ Current events and trending topics</li>
            <li>β’ Specific art styles or references</li>
            <li>β’ Technical or specialized subjects</li>
            <li>β’ Pop culture references</li>
        </ul>
    </div>
""", visible=False)

# Show/hide web search info based on checkbox
def toggle_search_info(use_search):
    """Return a Gradio update whose ``visible`` flag mirrors the checkbox value."""
    return gr.update(visible=use_search)

# Re-evaluate the info box visibility whenever the checkbox changes.
use_web_search.change(toggle_search_info, inputs=[use_web_search], outputs=[web_search_info])
|
585 |
|
586 |
# Add hyperparameter controls in an accordion
|
587 |
+
with gr.Accordion("βοΈ Advanced Settings", open=False):
|
588 |
# Lay out the parameters two per row
|
589 |
with gr.Group():
|
590 |
with gr.Row():
|
|
|
624 |
label="Temperature", info="Controls randomness in text generation")
|
625 |
|
626 |
thinking_output = gr.Textbox(label="Thinking Process", visible=False)
|
627 |
+
img_output = gr.Image(label="Generated Image", elem_classes=["image-frame"])
|
628 |
+
gen_btn = gr.Button("π¨ Generate Image", variant="primary", size="lg")
|
629 |
|
630 |
# Dynamically show/hide thinking process box and parameters
|
631 |
def update_thinking_visibility(show):
|
|
|
641 |
triggers=[gen_btn.click, txt_input.submit],
|
642 |
fn=text_to_image,
|
643 |
inputs=[
|
644 |
+
txt_input, use_web_search, show_thinking, cfg_text_scale,
|
645 |
cfg_interval, timestep_shift,
|
646 |
num_timesteps, cfg_renorm_min, cfg_renorm_type,
|
647 |
max_think_token_n, do_sample, text_temperature, seed, image_ratio
|
|
|
652 |
with gr.Tab("ποΈ Image Edit"):
|
653 |
with gr.Row():
|
654 |
with gr.Column(scale=1):
|
655 |
+
edit_image_input = gr.Image(label="Input Image", value=load_example_image('test_images/women.jpg'), elem_classes=["image-frame"])
|
656 |
edit_prompt = gr.Textbox(
|
657 |
+
label="Edit Prompt",
|
658 |
+
value="She boards a modern subway, quietly reading a folded newspaper, wearing the same clothes.",
|
659 |
+
lines=2
|
660 |
)
|
661 |
|
662 |
with gr.Column(scale=1):
|
663 |
+
edit_image_output = gr.Image(label="Edited Result", elem_classes=["image-frame"])
|
664 |
edit_thinking_output = gr.Textbox(label="Thinking Process", visible=False)
|
665 |
|
666 |
with gr.Row():
|
667 |
+
edit_use_web_search = gr.Checkbox(
|
668 |
+
label="π Enable Web Search",
|
669 |
+
value=False,
|
670 |
+
info="Search for references and context to improve editing"
|
671 |
+
)
|
672 |
+
edit_show_thinking = gr.Checkbox(label="π Show Thinking Process", value=False)
|
673 |
|
674 |
# Add hyperparameter controls in an accordion
|
675 |
+
with gr.Accordion("βοΈ Advanced Settings", open=False):
|
676 |
with gr.Group():
|
677 |
with gr.Row():
|
678 |
edit_seed = gr.Slider(minimum=0, maximum=1000000, value=0, step=1, interactive=True,
|
|
|
710 |
edit_text_temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, interactive=True,
|
711 |
label="Temperature", info="Controls randomness in text generation")
|
712 |
|
713 |
+
edit_btn = gr.Button("βοΈ Apply Edit", variant="primary", size="lg")
|
714 |
|
715 |
# Dynamically show/hide thinking process box for editing
|
716 |
def update_edit_thinking_visibility(show):
|
|
|
726 |
triggers=[edit_btn.click, edit_prompt.submit],
|
727 |
fn=edit_image,
|
728 |
inputs=[
|
729 |
+
edit_image_input, edit_prompt, edit_use_web_search, edit_show_thinking,
|
730 |
edit_cfg_text_scale, edit_cfg_img_scale, edit_cfg_interval,
|
731 |
edit_timestep_shift, edit_num_timesteps,
|
732 |
edit_cfg_renorm_min, edit_cfg_renorm_type,
|
|
|
738 |
with gr.Tab("πΌοΈ Image Understanding"):
|
739 |
with gr.Row():
|
740 |
with gr.Column(scale=1):
|
741 |
+
img_input = gr.Image(label="Input Image", value=load_example_image('test_images/meme.jpg'), elem_classes=["image-frame"])
|
742 |
understand_prompt = gr.Textbox(
|
743 |
+
label="Question",
|
744 |
+
value="Can someone explain what's funny about this meme??",
|
745 |
+
lines=2
|
746 |
)
|
747 |
|
748 |
with gr.Column(scale=1):
|
749 |
+
txt_output = gr.Textbox(label="AI Response", lines=20)
|
750 |
|
751 |
with gr.Row():
|
752 |
+
understand_use_web_search = gr.Checkbox(
|
753 |
+
label="π Enable Web Search",
|
754 |
+
value=False,
|
755 |
+
info="Search for context and references to better understand the image"
|
756 |
+
)
|
757 |
+
understand_show_thinking = gr.Checkbox(label="π Show Thinking Process", value=False)
|
758 |
|
759 |
# Add hyperparameter controls in an accordion
|
760 |
+
with gr.Accordion("βοΈ Advanced Settings", open=False):
|
761 |
with gr.Row():
|
762 |
understand_do_sample = gr.Checkbox(label="Sampling", value=False, info="Enable sampling for text generation")
|
763 |
understand_text_temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.05, interactive=True,
|
|
|
765 |
understand_max_new_tokens = gr.Slider(minimum=64, maximum=4096, value=512, step=64, interactive=True,
|
766 |
label="Max New Tokens", info="Maximum length of generated text, including potential thinking")
|
767 |
|
768 |
+
img_understand_btn = gr.Button("π Analyze Image", variant="primary", size="lg")
|
769 |
|
770 |
gr.on(
|
771 |
triggers=[img_understand_btn.click, understand_prompt.submit],
|
772 |
fn=image_understanding,
|
773 |
inputs=[
|
774 |
+
img_input, understand_prompt, understand_use_web_search, understand_show_thinking,
|
775 |
understand_do_sample, understand_text_temperature, understand_max_new_tokens
|
776 |
],
|
777 |
outputs=txt_output
|
778 |
)
|
779 |
|
780 |
+
gr.HTML("""
|
781 |
+
<div style="text-align: center; margin-top: 40px; padding: 20px; background: rgba(255, 255, 255, 0.1); border-radius: 15px;">
|
782 |
+
<p style="color: #e0e0e0; font-size: 1.1em;">
|
783 |
+
π<a href="https://bagel-ai.org/" target="_blank">Website</a>
|
784 |
+
π<a href="https://arxiv.org/abs/2505.14683" target="_blank">Research Paper</a>
|
785 |
+
π€<a href="https://huggingface.co/ByteDance-Seed/BAGEL-7B-MoT" target="_blank">Model</a>
|
786 |
+
π<a href="https://demo.bagel-ai.org/" target="_blank">Demo</a>
|
787 |
+
π¬<a href="https://discord.gg/Z836xxzy" target="_blank">Discord</a>
|
788 |
+
π§<a href="mailto:[email protected]">Contact</a>
|
789 |
+
</p>
|
790 |
+
<p style="color: #ffd54f; margin-top: 15px; font-size: 0.95em;">
|
791 |
+
<strong>π Web Search:</strong> Powered by Brave Search API when BSEARCH_API environment variable is set
|
792 |
+
</p>
|
793 |
+
</div>
|
794 |
+
""")
|
795 |
|
796 |
demo.launch(share=True)
|