Upload app.py with huggingface_hub
app.py
CHANGED
@@ -53,7 +53,7 @@ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
 llm_config = config.llm
 llm_config['_attn_implementation'] = 'eager'
 harmon_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-
+harmon_model = AutoModel.from_pretrained(model_path, llm=llm_config,
                                          trust_remote_code=True).eval()
 
 special_tokens_dict = {'additional_special_tokens': ["<image>", ]}
@@ -64,10 +64,9 @@ image_token_idx = harmon_tokenizer.encode("<image>", add_special_tokens=False)[-
 print(f"Image token: {harmon_tokenizer.decode(image_token_idx)}", flush=True)
 
 if torch.cuda.is_available():
-
+    harmon_model = harmon_model.to(torch.bfloat16).cuda()
 else:
-
-
+    harmon_model = harmon_model.to(torch.float16)
 
 
 def expand2square(pil_img, background_color):
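Note: the context lines around this hunk register "<image>" as an additional special token and then look up its id. A minimal standalone sketch of that pattern with the Hugging Face tokenizer API (the "gpt2" checkpoint is only a hypothetical stand-in, not the model used in this Space):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # hypothetical stand-in checkpoint
num_added = tok.add_special_tokens({'additional_special_tokens': ["<image>"]})
image_token_idx = tok.encode("<image>", add_special_tokens=False)[-1]
print(num_added, image_token_idx, tok.decode([image_token_idx]))
# If the model's embedding table does not already cover the new token, the model
# would also need model.resize_token_embeddings(len(tok)).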
@@ -104,94 +103,35 @@ def multimodal_understanding(image, question, seed, top_p, temperature, progress
     image = expand2square(
         image, (127, 127, 127))
     image = image.resize(size=(image_size, image_size))
-    image = torch.from_numpy(np.array(image)).to(dtype=
+    image = torch.from_numpy(np.array(image)).to(dtype=harmon_model.dtype, device=cuda_device)
     image = rearrange(image, 'h w c -> c h w')[None]
     image = 2 * (image / 255) - 1
 
     prompt = PROMPT_TEMPLATE['INSTRUCTION'].format(input="<image>\n" + question)
     assert '<image>' in prompt
-    image_length = (image_size // 16) ** 2 +
+    image_length = (image_size // 16) ** 2 + harmon_model.mar.buffer_size
     prompt = prompt.replace('<image>', '<image>' * image_length)
     input_ids = harmon_tokenizer.encode(
         prompt, add_special_tokens=True, return_tensors='pt').to(cuda_device)
-    _, z_enc =
-    inputs_embeds = z_enc.new_zeros(*input_ids.shape,
+    _, z_enc = harmon_model.extract_visual_feature(harmon_model.encode(image))
+    inputs_embeds = z_enc.new_zeros(*input_ids.shape, harmon_model.llm.config.hidden_size)
     inputs_embeds[input_ids == image_token_idx] = z_enc.flatten(0, 1)
-    inputs_embeds[input_ids != image_token_idx] =
+    inputs_embeds[input_ids != image_token_idx] = harmon_model.llm.get_input_embeddings()(
         input_ids[input_ids != image_token_idx]
     )
-    output =
-
-
-
-
-
-
-
-
-
-
-    return harmon_tokenizer.decode(output[0], skip_special_tokens=True)
-
-
-
-
-def generate(input_ids,
-             width,
-             height,
-             temperature: float = 1,
-             parallel_size: int = 5,
-             cfg_weight: float = 5,
-             image_token_num_per_image: int = 576,
-             patch_size: int = 16,
-             progress=gr.Progress(track_tqdm=True)):
-    # Clear CUDA cache before generating
-    torch.cuda.empty_cache()
-
-    tokens = torch.zeros((parallel_size * 2, len(input_ids)), dtype=torch.int).to(cuda_device)
-    for i in range(parallel_size * 2):
-        tokens[i, :] = input_ids
-        if i % 2 != 0:
-            tokens[i, 1:-1] = vl_chat_processor.pad_id
-    inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)
-    generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int).to(cuda_device)
-
-    pkv = None
-    for i in range(image_token_num_per_image):
-        with torch.no_grad():
-            outputs = vl_gpt.language_model.model(inputs_embeds=inputs_embeds,
-                                                  use_cache=True,
-                                                  past_key_values=pkv)
-            pkv = outputs.past_key_values
-            hidden_states = outputs.last_hidden_state
-            logits = vl_gpt.gen_head(hidden_states[:, -1, :])
-            logit_cond = logits[0::2, :]
-            logit_uncond = logits[1::2, :]
-            logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
-            probs = torch.softmax(logits / temperature, dim=-1)
-            next_token = torch.multinomial(probs, num_samples=1)
-            generated_tokens[:, i] = next_token.squeeze(dim=-1)
-            next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
-
-            img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
-            inputs_embeds = img_embeds.unsqueeze(dim=1)
-
-
-
-    patches = vl_gpt.gen_vision_model.decode_code(generated_tokens.to(dtype=torch.int),
-                                                  shape=[parallel_size, 8, width // patch_size, height // patch_size])
-
-    return generated_tokens.to(dtype=torch.int), patches
-
-def unpack(dec, width, height, parallel_size=5):
-    dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
-    dec = np.clip((dec + 1) / 2 * 255, 0, 255)
-
-    visual_img = np.zeros((parallel_size, width, height, 3), dtype=np.uint8)
-    visual_img[:, :, :] = dec
-
-    return visual_img
+    output = harmon_model.llm.generate(inputs_embeds=inputs_embeds,
+                                       eos_token_id=harmon_tokenizer.eos_token_id,
+                                       pad_token_id=harmon_tokenizer.pad_token_id
+                                       if harmon_tokenizer.pad_token_id is not None else
+                                       harmon_tokenizer.eos_token_id,
+                                       max_new_tokens=max_new_tokens,
+                                       do_sample=False,  # if temperature == 0 else True,
+                                       use_cache=True,
+                                       # temperature=temperature,
+                                       # top_p=top_p
+                                       )
 
+    return harmon_tokenizer.decode(output[0], skip_special_tokens=True)
 
 
 @torch.inference_mode()
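In the rewritten understanding path, inputs_embeds is assembled directly: positions whose token id equals image_token_idx receive the encoded visual features z_enc, and every other position receives an ordinary word embedding. A minimal self-contained sketch of that boolean-mask splice with dummy tensors (shapes and the toy embedding layer are illustrative, not the model's):

import torch

hidden_size, vocab, image_token_idx = 8, 100, 99
embed = torch.nn.Embedding(vocab, hidden_size)        # stand-in for the LLM's input embeddings

input_ids = torch.tensor([[5, 7, 99, 99, 99, 11]])    # one prompt with three <image> slots
z_enc = torch.randn(1, 3, hidden_size)                # stand-in for encoded visual features

inputs_embeds = z_enc.new_zeros(*input_ids.shape, hidden_size)
inputs_embeds[input_ids == image_token_idx] = z_enc.flatten(0, 1)   # visual features into <image> slots
inputs_embeds[input_ids != image_token_idx] = embed(input_ids[input_ids != image_token_idx])
print(inputs_embeds.shape)                            # torch.Size([1, 6, 8])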
@@ -208,34 +148,35 @@ def generate_image(prompt,
     torch.manual_seed(seed)
     torch.cuda.manual_seed(seed)
     np.random.seed(seed)
-
-
-
-
+
+    negative_prompt = 'Generate an image.'
+    repeat = 4
+    num_steps = 64
+    image_size = 512
+
+    assert image_size == 512
+    m = n = image_size // 16
+
+    prompts = [PROMPT_TEMPLATE['INSTRUCTION'].format(input=prompt)] * repeat
+
+    if guidance != 1.0:
+        prompts += [PROMPT_TEMPLATE['INSTRUCTION'].format(input=negative_prompt)] * len(prompts)
+
+    inputs = harmon_tokenizer(
+        prompts, add_special_tokens=True, return_tensors='pt', padding=True).to(cuda_device)
+
     with torch.no_grad():
-
-
-
-
-
-
-
-
-
-
-
-                                   cfg_weight=guidance,
-                                   parallel_size=parallel_size,
-                                   temperature=t2i_temperature)
-        images = unpack(patches,
-                        width // 16 * 16,
-                        height // 16 * 16,
-                        parallel_size=parallel_size)
-
-        # return [Image.fromarray(images[i]).resize((768, 768), Image.LANCZOS) for i in range(parallel_size)]
-        stime = time.time()
-        ret_images = [image_upsample(Image.fromarray(images[i])) for i in range(parallel_size)]
-        print(f'upsample time: {time.time() - stime}')
+
+        images = harmon_model.sample(**inputs, num_iter=num_steps, cfg=guidance, cfg_schedule="constant",
+                                     temperature=temperature, progress=True, image_shape=(m, n))
+
+        images = rearrange(images, 'b c h w -> b h w c')
+
+        images = torch.clamp(
+            127.5 * images + 128.0, 0, 255).to("cpu", dtype=torch.uint8).numpy()
+
+        ret_images = [image_upsample(Image.fromarray(image)) for image in images]
+
     return ret_images
 
 
@@ -259,7 +200,7 @@ css = '''
 .gradio-container {max-width: 960px !important}
 '''
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("#
+    gr.Markdown("# Harmon 1.5B")
     with gr.Tab("Multimodal Understanding"):
         gr.Markdown(value="## Multimodal Understanding")
         image_input = gr.Image()
@@ -292,7 +233,7 @@ with gr.Blocks(css=css) as demo:
     with gr.Tab("Text-to-Image Generation"):
         gr.Markdown(value="## Text-to-Image Generation")
 
-        prompt_input = gr.Textbox(label="Prompt.
+        prompt_input = gr.Textbox(label="Prompt.")
 
         generation_button = gr.Button("Generate Images")
 
@@ -300,7 +241,7 @@ with gr.Blocks(css=css) as demo:
 
         with gr.Accordion("Advanced options", open=False):
             with gr.Row():
-                cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=
+                cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=3, step=0.5, label="CFG Weight")
                 t2i_temperature = gr.Slider(minimum=0, maximum=1, value=1.0, step=0.05, label="temperature")
                 seed_input = gr.Number(label="Seed (Optional)", precision=0, value=1234)
 
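This last hunk only changes the CFG slider's default; the diff does not show how the controls are wired to the generation callback. For orientation, a self-contained sketch of how such a Gradio layout is typically connected — the callback signature, the Gallery output, and the argument order are assumptions for illustration, not taken from this file:

import gradio as gr

def generate_image(prompt, seed, guidance, t2i_temperature):   # signature assumed for illustration
    return []   # placeholder; the real app returns a list of upsampled PIL images

with gr.Blocks() as demo:
    prompt_input = gr.Textbox(label="Prompt.")
    generation_button = gr.Button("Generate Images")
    with gr.Accordion("Advanced options", open=False):
        with gr.Row():
            cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=3, step=0.5, label="CFG Weight")
            t2i_temperature = gr.Slider(minimum=0, maximum=1, value=1.0, step=0.05, label="temperature")
            seed_input = gr.Number(label="Seed (Optional)", precision=0, value=1234)
    image_output = gr.Gallery(label="Generated Images")
    generation_button.click(fn=generate_image,
                            inputs=[prompt_input, seed_input, cfg_weight_input, t2i_temperature],
                            outputs=image_output)

demo.launch()  # serves the demo locally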