Upload app.py with huggingface_hub
app.py
CHANGED
@@ -53,7 +53,7 @@ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
 llm_config = config.llm
 llm_config['_attn_implementation'] = 'eager'
 harmon_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-
+harmon_model = AutoModel.from_pretrained(model_path, llm=llm_config,
                                          trust_remote_code=True).eval()
 
 special_tokens_dict = {'additional_special_tokens': ["<image>", ]}
@@ -64,10 +64,9 @@ image_token_idx = harmon_tokenizer.encode("<image>", add_special_tokens=False)[-
 print(f"Image token: {harmon_tokenizer.decode(image_token_idx)}", flush=True)
 
 if torch.cuda.is_available():
-
+    harmon_model = harmon_model.to(torch.bfloat16).cuda()
 else:
-
-
+    harmon_model = harmon_model.to(torch.float16)
 
 
 def expand2square(pil_img, background_color):
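Note: the context lines around this hunk register "<image>" as an additional special token and then look up its id. A minimal standalone sketch of that pattern with the Hugging Face tokenizer API (the "gpt2" checkpoint is only a hypothetical stand-in, not the model used in this Space):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # hypothetical stand-in checkpoint
num_added = tok.add_special_tokens({'additional_special_tokens': ["<image>"]})
image_token_idx = tok.encode("<image>", add_special_tokens=False)[-1]
print(num_added, image_token_idx, tok.decode([image_token_idx]))
# If the model's embedding table does not already cover the new token, the model
# would also need model.resize_token_embeddings(len(tok)).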
@@ -104,94 +103,35 @@ def multimodal_understanding(image, question, seed, top_p, temperature, progress
     image = expand2square(
         image, (127, 127, 127))
     image = image.resize(size=(image_size, image_size))
-    image = torch.from_numpy(np.array(image)).to(dtype=
+    image = torch.from_numpy(np.array(image)).to(dtype=harmon_model.dtype, device=cuda_device)
     image = rearrange(image, 'h w c -> c h w')[None]
     image = 2 * (image / 255) - 1
 
     prompt = PROMPT_TEMPLATE['INSTRUCTION'].format(input="<image>\n" + question)
     assert '<image>' in prompt
-    image_length = (image_size // 16) ** 2 +
+    image_length = (image_size // 16) ** 2 + harmon_model.mar.buffer_size
     prompt = prompt.replace('<image>', '<image>' * image_length)
     input_ids = harmon_tokenizer.encode(
         prompt, add_special_tokens=True, return_tensors='pt').to(cuda_device)
-    _, z_enc =
-    inputs_embeds = z_enc.new_zeros(*input_ids.shape,
+    _, z_enc = harmon_model.extract_visual_feature(harmon_model.encode(image))
+    inputs_embeds = z_enc.new_zeros(*input_ids.shape, harmon_model.llm.config.hidden_size)
     inputs_embeds[input_ids == image_token_idx] = z_enc.flatten(0, 1)
-    inputs_embeds[input_ids != image_token_idx] =
+    inputs_embeds[input_ids != image_token_idx] = harmon_model.llm.get_input_embeddings()(
         input_ids[input_ids != image_token_idx]
     )
-    output =
-
-
-
-
-
-
-
-
-
-
-    return harmon_tokenizer.decode(output[0], skip_special_tokens=True)
-
-
-
-
-def generate(input_ids,
-             width,
-             height,
-             temperature: float = 1,
-             parallel_size: int = 5,
-             cfg_weight: float = 5,
-             image_token_num_per_image: int = 576,
-             patch_size: int = 16,
-             progress=gr.Progress(track_tqdm=True)):
-    # Clear CUDA cache before generating
-    torch.cuda.empty_cache()
-
-    tokens = torch.zeros((parallel_size * 2, len(input_ids)), dtype=torch.int).to(cuda_device)
-    for i in range(parallel_size * 2):
-        tokens[i, :] = input_ids
-        if i % 2 != 0:
-            tokens[i, 1:-1] = vl_chat_processor.pad_id
-    inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)
-    generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int).to(cuda_device)
-
-    pkv = None
-    for i in range(image_token_num_per_image):
-        with torch.no_grad():
-            outputs = vl_gpt.language_model.model(inputs_embeds=inputs_embeds,
-                                                  use_cache=True,
-                                                  past_key_values=pkv)
-            pkv = outputs.past_key_values
-            hidden_states = outputs.last_hidden_state
-            logits = vl_gpt.gen_head(hidden_states[:, -1, :])
-            logit_cond = logits[0::2, :]
-            logit_uncond = logits[1::2, :]
-            logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
-            probs = torch.softmax(logits / temperature, dim=-1)
-            next_token = torch.multinomial(probs, num_samples=1)
-            generated_tokens[:, i] = next_token.squeeze(dim=-1)
-            next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
-
-            img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
-            inputs_embeds = img_embeds.unsqueeze(dim=1)
-
-
-
-    patches = vl_gpt.gen_vision_model.decode_code(generated_tokens.to(dtype=torch.int),
-                                                  shape=[parallel_size, 8, width // patch_size, height // patch_size])
-
-    return generated_tokens.to(dtype=torch.int), patches
-
-def unpack(dec, width, height, parallel_size=5):
-    dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
-    dec = np.clip((dec + 1) / 2 * 255, 0, 255)
-
-    visual_img = np.zeros((parallel_size, width, height, 3), dtype=np.uint8)
-    visual_img[:, :, :] = dec
-
-    return visual_img
+    output = harmon_model.llm.generate(inputs_embeds=inputs_embeds,
+                                       eos_token_id=harmon_tokenizer.eos_token_id,
+                                       pad_token_id=harmon_tokenizer.pad_token_id
+                                       if harmon_tokenizer.pad_token_id is not None else
+                                       harmon_tokenizer.eos_token_id,
+                                       max_new_tokens=max_new_tokens,
+                                       do_sample=False,  # if temperature == 0 else True,
+                                       use_cache=True,
+                                       # temperature=temperature,
+                                       # top_p=top_p
+                                       )
 
+    return harmon_tokenizer.decode(output[0], skip_special_tokens=True)
 
 
 @torch.inference_mode()
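In the rewritten understanding path, inputs_embeds is assembled directly: positions whose token id equals image_token_idx receive the encoded visual features z_enc, and every other position receives an ordinary word embedding. A minimal self-contained sketch of that boolean-mask splice with dummy tensors (shapes and the toy embedding layer are illustrative, not the model's):

import torch

hidden_size, vocab, image_token_idx = 8, 100, 99
embed = torch.nn.Embedding(vocab, hidden_size)        # stand-in for the LLM's input embeddings

input_ids = torch.tensor([[5, 7, 99, 99, 99, 11]])    # one prompt with three <image> slots
z_enc = torch.randn(1, 3, hidden_size)                # stand-in for encoded visual features

inputs_embeds = z_enc.new_zeros(*input_ids.shape, hidden_size)
inputs_embeds[input_ids == image_token_idx] = z_enc.flatten(0, 1)   # visual features into <image> slots
inputs_embeds[input_ids != image_token_idx] = embed(input_ids[input_ids != image_token_idx])
print(inputs_embeds.shape)                            # torch.Size([1, 6, 8])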
@@ -208,34 +148,35 @@ def generate_image(prompt,
     torch.manual_seed(seed)
     torch.cuda.manual_seed(seed)
     np.random.seed(seed)
-
-
-
-
+
+    negative_prompt = 'Generate an image.'
+    repeat = 4
+    num_steps = 64
+    image_size = 512
+
+    assert image_size == 512
+    m = n = image_size // 16
+
+    prompts = [PROMPT_TEMPLATE['INSTRUCTION'].format(input=prompt)] * repeat
+
+    if guidance != 1.0:
+        prompts += [PROMPT_TEMPLATE['INSTRUCTION'].format(input=negative_prompt)] * len(prompts)
+
+    inputs = harmon_tokenizer(
+        prompts, add_special_tokens=True, return_tensors='pt', padding=True).to(cuda_device)
+
     with torch.no_grad():
-
-
-
-
-
-
-
-
-
-
-
-                                   cfg_weight=guidance,
-                                   parallel_size=parallel_size,
-                                   temperature=t2i_temperature)
-        images = unpack(patches,
-                        width // 16 * 16,
-                        height // 16 * 16,
-                        parallel_size=parallel_size)
-
-        # return [Image.fromarray(images[i]).resize((768, 768), Image.LANCZOS) for i in range(parallel_size)]
-        stime = time.time()
-        ret_images = [image_upsample(Image.fromarray(images[i])) for i in range(parallel_size)]
-        print(f'upsample time: {time.time() - stime}')
+
+        images = harmon_model.sample(**inputs, num_iter=num_steps, cfg=guidance, cfg_schedule="constant",
+                                     temperature=temperature, progress=True, image_shape=(m, n))
+
+        images = rearrange(images, 'b c h w -> b h w c')
+
+        images = torch.clamp(
+            127.5 * images + 128.0, 0, 255).to("cpu", dtype=torch.uint8).numpy()
+
+        ret_images = [image_upsample(Image.fromarray(image)) for image in images]
+
     return ret_images
 
 
@@ -259,7 +200,7 @@ css = '''
 .gradio-container {max-width: 960px !important}
 '''
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("#
+    gr.Markdown("# Harmon 1.5B")
     with gr.Tab("Multimodal Understanding"):
         gr.Markdown(value="## Multimodal Understanding")
         image_input = gr.Image()
@@ -292,7 +233,7 @@ with gr.Blocks(css=css) as demo:
     with gr.Tab("Text-to-Image Generation"):
         gr.Markdown(value="## Text-to-Image Generation")
 
-        prompt_input = gr.Textbox(label="Prompt.
+        prompt_input = gr.Textbox(label="Prompt.")
 
         generation_button = gr.Button("Generate Images")
 
@@ -300,7 +241,7 @@ with gr.Blocks(css=css) as demo:
 
         with gr.Accordion("Advanced options", open=False):
             with gr.Row():
-                cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=
+                cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=3, step=0.5, label="CFG Weight")
                 t2i_temperature = gr.Slider(minimum=0, maximum=1, value=1.0, step=0.05, label="temperature")
                 seed_input = gr.Number(label="Seed (Optional)", precision=0, value=1234)
 
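This last hunk only changes the CFG slider's default; the diff does not show how the controls are wired to the generation callback. For orientation, a self-contained sketch of how such a Gradio layout is typically connected — the callback signature, the Gallery output, and the argument order are assumptions for illustration, not taken from this file:

import gradio as gr

def generate_image(prompt, seed, guidance, t2i_temperature):   # signature assumed for illustration
    return []   # placeholder; the real app returns a list of upsampled PIL images

with gr.Blocks() as demo:
    prompt_input = gr.Textbox(label="Prompt.")
    generation_button = gr.Button("Generate Images")
    with gr.Accordion("Advanced options", open=False):
        with gr.Row():
            cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=3, step=0.5, label="CFG Weight")
            t2i_temperature = gr.Slider(minimum=0, maximum=1, value=1.0, step=0.05, label="temperature")
            seed_input = gr.Number(label="Seed (Optional)", precision=0, value=1234)
    image_output = gr.Gallery(label="Generated Images")
    generation_button.click(fn=generate_image,
                            inputs=[prompt_input, seed_input, cfg_weight_input, t2i_temperature],
                            outputs=image_output)

demo.launch()  # serves the demo locally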