wusize committed · Commit 1858ad7 · verified · 1 Parent(s): 90172ea

Upload app.py with huggingface_hub

Files changed (1): app.py (+51 -110)
app.py CHANGED
@@ -53,7 +53,7 @@ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
 llm_config = config.llm
 llm_config['_attn_implementation'] = 'eager'
 harmon_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-model = AutoModel.from_pretrained(model_path, llm=llm_config,
+harmon_model = AutoModel.from_pretrained(model_path, llm=llm_config,
                                   trust_remote_code=True).eval()

 special_tokens_dict = {'additional_special_tokens': ["<image>", ]}
@@ -64,10 +64,9 @@ image_token_idx = harmon_tokenizer.encode("<image>", add_special_tokens=False)[-
 print(f"Image token: {harmon_tokenizer.decode(image_token_idx)}", flush=True)

 if torch.cuda.is_available():
-    model = model.to(torch.bfloat16).cuda()
+    harmon_model = harmon_model.to(torch.bfloat16).cuda()
 else:
-    model = model.to(torch.float16)
-
+    harmon_model = harmon_model.to(torch.float16)


 def expand2square(pil_img, background_color):
@@ -104,94 +103,35 @@ def multimodal_understanding(image, question, seed, top_p, temperature, progress
     image = expand2square(
         image, (127, 127, 127))
     image = image.resize(size=(image_size, image_size))
-    image = torch.from_numpy(np.array(image)).to(dtype=model.dtype, device=cuda_device)
+    image = torch.from_numpy(np.array(image)).to(dtype=harmon_model.dtype, device=cuda_device)
     image = rearrange(image, 'h w c -> c h w')[None]
     image = 2 * (image / 255) - 1

     prompt = PROMPT_TEMPLATE['INSTRUCTION'].format(input="<image>\n" + question)
     assert '<image>' in prompt
-    image_length = (image_size // 16) ** 2 + model.mar.buffer_size
+    image_length = (image_size // 16) ** 2 + harmon_model.mar.buffer_size
     prompt = prompt.replace('<image>', '<image>' * image_length)
     input_ids = harmon_tokenizer.encode(
         prompt, add_special_tokens=True, return_tensors='pt').to(cuda_device)
-    _, z_enc = model.extract_visual_feature(model.encode(image))
-    inputs_embeds = z_enc.new_zeros(*input_ids.shape, model.llm.config.hidden_size)
+    _, z_enc = harmon_model.extract_visual_feature(harmon_model.encode(image))
+    inputs_embeds = z_enc.new_zeros(*input_ids.shape, harmon_model.llm.config.hidden_size)
     inputs_embeds[input_ids == image_token_idx] = z_enc.flatten(0, 1)
-    inputs_embeds[input_ids != image_token_idx] = model.llm.get_input_embeddings()(
+    inputs_embeds[input_ids != image_token_idx] = harmon_model.llm.get_input_embeddings()(
         input_ids[input_ids != image_token_idx]
     )
-    output = model.llm.generate(inputs_embeds=inputs_embeds,
-                                eos_token_id=harmon_tokenizer.eos_token_id,
-                                pad_token_id=harmon_tokenizer.pad_token_id
-                                if harmon_tokenizer.pad_token_id is not None else
-                                harmon_tokenizer.eos_token_id,
-                                max_new_tokens=max_new_tokens,
-                                do_sample=False if temperature == 0 else True,
-                                use_cache=True,
-                                temperature=temperature,
-                                top_p=top_p,
-                                )
-    return harmon_tokenizer.decode(output[0], skip_special_tokens=True)
-
-
-
-
-def generate(input_ids,
-             width,
-             height,
-             temperature: float = 1,
-             parallel_size: int = 5,
-             cfg_weight: float = 5,
-             image_token_num_per_image: int = 576,
-             patch_size: int = 16,
-             progress=gr.Progress(track_tqdm=True)):
-    # Clear CUDA cache before generating
-    torch.cuda.empty_cache()
-
-    tokens = torch.zeros((parallel_size * 2, len(input_ids)), dtype=torch.int).to(cuda_device)
-    for i in range(parallel_size * 2):
-        tokens[i, :] = input_ids
-        if i % 2 != 0:
-            tokens[i, 1:-1] = vl_chat_processor.pad_id
-    inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)
-    generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int).to(cuda_device)
-
-    pkv = None
-    for i in range(image_token_num_per_image):
-        with torch.no_grad():
-            outputs = vl_gpt.language_model.model(inputs_embeds=inputs_embeds,
-                                                  use_cache=True,
-                                                  past_key_values=pkv)
-            pkv = outputs.past_key_values
-            hidden_states = outputs.last_hidden_state
-            logits = vl_gpt.gen_head(hidden_states[:, -1, :])
-            logit_cond = logits[0::2, :]
-            logit_uncond = logits[1::2, :]
-            logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
-            probs = torch.softmax(logits / temperature, dim=-1)
-            next_token = torch.multinomial(probs, num_samples=1)
-            generated_tokens[:, i] = next_token.squeeze(dim=-1)
-            next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
-
-            img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
-            inputs_embeds = img_embeds.unsqueeze(dim=1)
-
-
-
-    patches = vl_gpt.gen_vision_model.decode_code(generated_tokens.to(dtype=torch.int),
-                                                  shape=[parallel_size, 8, width // patch_size, height // patch_size])
-
-    return generated_tokens.to(dtype=torch.int), patches
-
-def unpack(dec, width, height, parallel_size=5):
-    dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
-    dec = np.clip((dec + 1) / 2 * 255, 0, 255)
-
-    visual_img = np.zeros((parallel_size, width, height, 3), dtype=np.uint8)
-    visual_img[:, :, :] = dec
-
-    return visual_img
+    output = harmon_model.llm.generate(inputs_embeds=inputs_embeds,
+                                       eos_token_id=harmon_tokenizer.eos_token_id,
+                                       pad_token_id=harmon_tokenizer.pad_token_id
+                                       if harmon_tokenizer.pad_token_id is not None else
+                                       harmon_tokenizer.eos_token_id,
+                                       max_new_tokens=max_new_tokens,
+                                       do_sample=False,  # if temperature == 0 else True,
+                                       use_cache=True,
+                                       # temperature=temperature,
+                                       # top_p=top_p
+                                       )

+    return harmon_tokenizer.decode(output[0], skip_special_tokens=True)


 @torch.inference_mode()
@@ -208,34 +148,35 @@ def generate_image(prompt,
     torch.manual_seed(seed)
     torch.cuda.manual_seed(seed)
     np.random.seed(seed)
-    width = 384
-    height = 384
-    parallel_size = 4
-
+
+    negative_prompt = 'Generate an image.'
+    repeat = 4
+    num_steps = 64
+    image_size = 512
+
+    assert image_size == 512
+    m = n = image_size // 16
+
+    prompts = [PROMPT_TEMPLATE['INSTRUCTION'].format(input=prompt)] * repeat
+
+    if guidance != 1.0:
+        prompts += [PROMPT_TEMPLATE['INSTRUCTION'].format(input=negative_prompt)] * len(prompts)
+
+    inputs = harmon_tokenizer(
+        prompts, add_special_tokens=True, return_tensors='pt', padding=True).to(cuda_device)
+
     with torch.no_grad():
-        messages = [{'role': '<|User|>', 'content': prompt},
-                    {'role': '<|Assistant|>', 'content': ''}]
-        text = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(conversations=messages,
-                                                                           sft_format=vl_chat_processor.sft_format,
-                                                                           system_prompt='')
-        text = text + vl_chat_processor.image_start_tag
-
-        input_ids = torch.LongTensor(tokenizer.encode(text))
-        output, patches = generate(input_ids,
-                                   width // 16 * 16,
-                                   height // 16 * 16,
-                                   cfg_weight=guidance,
-                                   parallel_size=parallel_size,
-                                   temperature=t2i_temperature)
-        images = unpack(patches,
-                        width // 16 * 16,
-                        height // 16 * 16,
-                        parallel_size=parallel_size)
-
-        # return [Image.fromarray(images[i]).resize((768, 768), Image.LANCZOS) for i in range(parallel_size)]
-        stime = time.time()
-        ret_images = [image_upsample(Image.fromarray(images[i])) for i in range(parallel_size)]
-        print(f'upsample time: {time.time() - stime}')
+
+        images = harmon_model.sample(**inputs, num_iter=num_steps, cfg=guidance, cfg_schedule="constant",
+                                     temperature=temperature, progress=True, image_shape=(m, n))
+
+        images = rearrange(images, 'b c h w -> b h w c')
+
+        images = torch.clamp(
+            127.5 * images + 128.0, 0, 255).to("cpu", dtype=torch.uint8).numpy()
+
+        ret_images = [image_upsample(Image.fromarray(image)) for image in images]
+
     return ret_images


@@ -259,7 +200,7 @@ css = '''
 .gradio-container {max-width: 960px !important}
 '''
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("# Janus Pro 7B")
+    gr.Markdown("# Harmon 1.5B")
     with gr.Tab("Multimodal Understanding"):
         gr.Markdown(value="## Multimodal Understanding")
         image_input = gr.Image()
@@ -292,7 +233,7 @@ with gr.Blocks(css=css) as demo:
     with gr.Tab("Text-to-Image Generation"):
         gr.Markdown(value="## Text-to-Image Generation")

-        prompt_input = gr.Textbox(label="Prompt. (Prompt in more detail can help produce better images!")
+        prompt_input = gr.Textbox(label="Prompt.")

         generation_button = gr.Button("Generate Images")

@@ -300,7 +241,7 @@ with gr.Blocks(css=css) as demo:

         with gr.Accordion("Advanced options", open=False):
             with gr.Row():
-                cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=5, step=0.5, label="CFG Weight")
+                cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=3, step=0.5, label="CFG Weight")
                 t2i_temperature = gr.Slider(minimum=0, maximum=1, value=1.0, step=0.05, label="temperature")
                 seed_input = gr.Number(label="Seed (Optional)", precision=0, value=1234)

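
For readers who want to exercise the new Harmon text-to-image path outside the Gradio UI, here is a minimal sketch assembled from the hunks above. It is not the authoritative app.py: model_path and the PROMPT_TEMPLATE stand-in are hypothetical placeholders (app.py defines the real instruction template elsewhere, outside this diff), and the harmon_model.sample(...) keyword arguments are copied from the diff rather than verified against the model's remote code.

# Minimal sketch of the Harmon sampling path shown in the diff; placeholders are marked below.
import torch
from PIL import Image
from einops import rearrange
from transformers import AutoConfig, AutoModel, AutoTokenizer

model_path = "..."  # hypothetical: set to the Harmon checkpoint the Space loads

# PROMPT_TEMPLATE is defined elsewhere in app.py (not shown in this diff); an identity
# template is used here only so the sketch is self-contained.
PROMPT_TEMPLATE = {'INSTRUCTION': '{input}'}

config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
llm_config = config.llm
llm_config['_attn_implementation'] = 'eager'
harmon_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
harmon_model = AutoModel.from_pretrained(model_path, llm=llm_config,
                                         trust_remote_code=True).eval()

device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = torch.bfloat16 if device == 'cuda' else torch.float16
harmon_model = harmon_model.to(device=device, dtype=dtype)

# Assumption: reuse the eos token as pad token so batched tokenization with padding works.
if harmon_tokenizer.pad_token is None:
    harmon_tokenizer.pad_token = harmon_tokenizer.eos_token

prompt = 'a red panda sitting on a tree branch'
negative_prompt = 'Generate an image.'   # unconditional prompt used for CFG, as in the diff
guidance, repeat, num_steps, image_size = 3.0, 4, 64, 512
m = n = image_size // 16                 # latent grid implied by the 512px setting above

prompts = [PROMPT_TEMPLATE['INSTRUCTION'].format(input=prompt)] * repeat
if guidance != 1.0:
    # Append negative prompts so conditional and unconditional batches share one forward pass.
    prompts += [PROMPT_TEMPLATE['INSTRUCTION'].format(input=negative_prompt)] * len(prompts)

inputs = harmon_tokenizer(prompts, add_special_tokens=True,
                          return_tensors='pt', padding=True).to(device)
with torch.no_grad():
    images = harmon_model.sample(**inputs, num_iter=num_steps, cfg=guidance,
                                 cfg_schedule="constant", temperature=1.0,
                                 progress=True, image_shape=(m, n))

# Convert the sampled tensors to uint8 HWC arrays, mirroring generate_image in the diff.
images = rearrange(images, 'b c h w -> b h w c')
images = torch.clamp(127.5 * images + 128.0, 0, 255).to('cpu', dtype=torch.uint8).numpy()
pil_images = [Image.fromarray(im) for im in images]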