alexnasa committed
Commit cb8b67b · verified · 1 Parent(s): 5232eee

Update app.py

Files changed (1): app.py (+18, -11)
app.py CHANGED
@@ -34,7 +34,14 @@ from torchvision import transforms
 from models.controlnet import ControlNetModel
 from models.unet_2d_condition import UNet2DConditionModel
 
+# VLM_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
 
+# vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+#     VLM_NAME,
+#     torch_dtype="auto",
+#     device_map="auto"   # immediately dispatches layers onto available GPUs
+# )
+# vlm_processor = AutoProcessor.from_pretrained(VLM_NAME)
 
 def _generate_vlm_prompt(
     vlm_model: Qwen2_5_VLForConditionalGeneration,
@@ -107,25 +114,25 @@ snapshot_download(
 
 
 snapshot_download(
-    repo_id="stabilityai/sd-turbo",
-    local_dir="preset/models/sd-turbo"
+    repo_id="stabilityai/stable-diffusion-2-1-base",
+    local_dir="preset/models/stable-diffusion-2-1-base"
 )
 
-
 snapshot_download(
     repo_id="xinyu1205/recognize_anything_model",
     local_dir="preset/models/"
 )
 
+
 # Load scheduler, tokenizer and models.
-pretrained_model_path = 'preset/models/sd-turbo'
+pretrained_model_path = 'preset/models/stable-diffusion-2-1-base'
 seesr_model_path = 'preset/models/seesr'
 
 scheduler = DDIMScheduler.from_pretrained(pretrained_model_path, subfolder="scheduler")
 text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder")
 tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
 vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae")
-# feature_extractor = CLIPImageProcessor.from_pretrained(f"{pretrained_model_path}/feature_extractor")
+feature_extractor = CLIPImageProcessor.from_pretrained(f"{pretrained_model_path}/feature_extractor")
 unet = UNet2DConditionModel.from_pretrained(seesr_model_path, subfolder="unet")
 controlnet = ControlNetModel.from_pretrained(seesr_model_path, subfolder="controlnet")
 
@@ -185,9 +192,9 @@ def magnify(
     user_prompt = "",
     positive_prompt = "clean, high-resolution, 8k, best quality, masterpiece",
     negative_prompt = "dotted, noise, blur, lowres, oversmooth, longbody, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
-    num_inference_steps = 2,
+    num_inference_steps = 50,
     scale_factor = 4,
-    cfg_scale = 1,
+    cfg_scale = 7.5,
     seed = 123,
     latent_tiled_size = 320,
     latent_tiled_overlap = 4,
@@ -288,15 +295,15 @@ with gr.Blocks(css=css, theme=theme) as demo:
             input_image = gr.Image(type="pil", height=512)
             run_button = gr.Button("🔎 Magnify 4x", variant="primary")
            duration_time = gr.Text(label="duration time", value=60, visible=False)
-            with gr.Accordion("Options"):
+            with gr.Accordion("Options", visible=False):
                 user_prompt = gr.Textbox(label="User Prompt", value="")
                 positive_prompt = gr.Textbox(label="Positive Prompt", value="clean, high-resolution, 8k, best quality, masterpiece")
                 negative_prompt = gr.Textbox(
                     label="Negative Prompt",
                     value="dotted, noise, blur, lowres, oversmooth, longbody, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality"
                 )
-                cfg_scale = gr.Slider(label="Classifier Free Guidance Scale (Set to 1.0 in sd-turbo)", minimum=1, maximum=10, value=1, step=0)
-                num_inference_steps = gr.Slider(label="Inference Steps", minimum=2, maximum=100, value=2, step=1)
+                cfg_scale = gr.Slider(label="Classifier Free Guidance Scale (Set to 1.0 in sd-turbo)", minimum=1, maximum=10, value=7.5, step=0)
+                num_inference_steps = gr.Slider(label="Inference Steps", minimum=2, maximum=100, value=50, step=1)
                 seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, value=231)
                 sample_times = gr.Slider(label="Sample Times", minimum=1, maximum=10, step=1, value=1)
                 latent_tiled_size = gr.Slider(label="Diffusion Tile Size", minimum=128, maximum=480, value=320, step=1)
@@ -331,7 +338,7 @@ with gr.Blocks(css=css, theme=theme) as demo:
         inputs = [
             input_image,
         ]
-        run_button.click(fn=magnify, inputs=[input_image,user_prompt,positive_prompt,negative_prompt,num_inference_steps, scale_factor, cfg_scale] , outputs=[result_gallery])
+        run_button.click(fn=magnify, inputs=input_image, outputs=[result_gallery])
         input_image.upload(fn=preprocess_image,inputs=input_image, outputs=input_image)
 
 demo.launch(share=True)
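
Note: the commit keeps the Qwen2.5-VL prompt generator commented out, so the VLM is not loaded eagerly at import time. For context, here is a minimal sketch of how such a model/processor pair is typically used to caption an input image; the helper name `describe_image` is illustrative and not part of this repo, and `_generate_vlm_prompt` in app.py may differ.

```python
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

VLM_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    VLM_NAME, torch_dtype="auto", device_map="auto"
)
vlm_processor = AutoProcessor.from_pretrained(VLM_NAME)

def describe_image(image: Image.Image) -> str:
    """Ask the VLM for a short description to use as a restoration prompt (illustrative)."""
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Describe this image in one short sentence."},
        ],
    }]
    # Render the chat template, then tokenize text and image together.
    chat = vlm_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = vlm_processor(text=[chat], images=[image], return_tensors="pt").to(vlm_model.device)
    output_ids = vlm_model.generate(**inputs, max_new_tokens=64)
    new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
    return vlm_processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
```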
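The defaults also move from sd-turbo-style settings (2 steps, cfg_scale = 1, i.e. guidance effectively disabled) to values typical for Stable Diffusion 2.1 base (50 steps, cfg_scale = 7.5), which relies on classifier-free guidance. Schematically, a diffusers-style guided denoising step looks like the sketch below; the SeeSR UNet and ControlNet take extra conditioning, so this is illustrative rather than the pipeline's exact code.

```python
import torch

def cfg_noise_pred(unet, latents, t, uncond_emb, text_emb, guidance_scale=7.5):
    """Classifier-free guidance: predict noise with and without the text prompt,
    then push the result toward the text-conditioned prediction."""
    latent_in = torch.cat([latents, latents])   # one half unconditional, one half conditional
    emb_in = torch.cat([uncond_emb, text_emb])
    noise = unet(latent_in, t, encoder_hidden_states=emb_in).sample
    noise_uncond, noise_text = noise.chunk(2)
    return noise_uncond + guidance_scale * (noise_text - noise_uncond)
```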
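Finally, with those defaults living in magnify's signature, the click event now wires only the image input; the other parameters fall back to their defaults (the Options accordion is hidden but its widgets are still created). A stand-alone sketch of that pattern, with a stub in place of the real SeeSR call:

```python
import gradio as gr

def magnify(input_image, num_inference_steps=50, scale_factor=4, cfg_scale=7.5):
    # Stub: the real function runs the SeeSR ControlNet pipeline here.
    return [input_image]

with gr.Blocks() as demo:
    input_image = gr.Image(type="pil", height=512)
    run_button = gr.Button("🔎 Magnify 4x", variant="primary")
    result_gallery = gr.Gallery()
    # Only the image is passed; the remaining parameters use their defaults.
    run_button.click(fn=magnify, inputs=input_image, outputs=[result_gallery])

demo.launch()
```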