# Copyright (c) 2023-2024 DeepSeek.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

from argparse import ArgumentParser
from typing import List, Dict

import PIL.Image
import torch
from transformers import AutoModelForCausalLM

from deepseek_vl2.models import DeepseekVLV2ForCausalLM, DeepseekVLV2Processor
from deepseek_vl2.serve.app_modules.utils import parse_ref_bbox


def load_pil_images(conversations: List[Dict[str, str]]) -> List[PIL.Image.Image]:
    """
    Args:
        conversations (List[Dict[str, str]]): the conversations with a list of messages. An example is:
            [
                {
                    "role": "User",
                    "content": "<image>\nExtract all information from this image and convert them into markdown format.",
                    "images": ["./examples/table_datasets.png"]
                },
                {"role": "Assistant", "content": ""},
            ]

    Returns:
        pil_images (List[PIL.Image.Image]): the list of PIL images.
    """
    pil_images = []

    for message in conversations:
        if "images" not in message:
            continue

        for image_path in message["images"]:
            pil_img = PIL.Image.open(image_path)
            pil_img = pil_img.convert("RGB")
            pil_images.append(pil_img)

    return pil_images
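
# A minimal usage sketch for load_pil_images (illustrative only; the conversation
# mirrors the docstring format above, and the image path is assumed to exist):
#
#   conversation = [
#       {"role": "<|User|>", "content": "<image>\nDescribe this image.",
#        "images": ["images/example.jpeg"]},
#       {"role": "<|Assistant|>", "content": ""},
#   ]
#   pil_images = load_pil_images(conversation)  # -> list of RGB PIL.Image objects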


def main(args):
    dtype = torch.bfloat16

    # specify the path to the model
    model_path = args.model_path
    vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
    tokenizer = vl_chat_processor.tokenizer

    vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        torch_dtype=dtype
    )
    vl_gpt = vl_gpt.cuda().eval()

    # multiple-image conversation example
    # Note: the <|grounding|> token is specifically designed for the grounded caption feature. It is not needed for normal conversations.
    conversation = [
        {
            "role": "<|User|>",
            "content": "<image>\n<image>\n<|grounding|>In the first image, an object within the red rectangle is marked. Locate the object of the same category in the second image.",
            "images": [
                "images/incontext_visual_grounding_1.jpeg",
                "images/icl_vg_2.jpeg"
            ],
        },
        {"role": "<|Assistant|>", "content": ""},
    ]

    # load the images and prepare the model inputs
    pil_images = load_pil_images(conversation)
    print(f"len(pil_images) = {len(pil_images)}")

    prepare_inputs = vl_chat_processor(
        conversations=conversation,
        images=pil_images,
        force_batchify=True,
        system_prompt=""
    ).to(vl_gpt.device, dtype=dtype)

    with torch.no_grad():
        if args.chunk_size == -1:
            inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
            past_key_values = None
        else:
            # incremental prefilling when using a 40G GPU for vl2-small
            inputs_embeds, past_key_values = vl_gpt.incremental_prefilling(
                input_ids=prepare_inputs.input_ids,
                images=prepare_inputs.images,
                images_seq_mask=prepare_inputs.images_seq_mask,
                images_spatial_crop=prepare_inputs.images_spatial_crop,
                attention_mask=prepare_inputs.attention_mask,
                chunk_size=args.chunk_size
            )
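
        # Either branch yields `inputs_embeds` for the full multimodal prompt; the
        # incremental path additionally returns a prefilled KV cache in
        # `past_key_values` for generate() to reuse, which keeps peak memory lower
        # on smaller GPUs (a reading of the calls above, not a documented guarantee).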

        # run the model to get the response
        outputs = vl_gpt.generate(
            # inputs_embeds=inputs_embeds[:, -1:],
            # input_ids=prepare_inputs.input_ids[:, -1:],
            inputs_embeds=inputs_embeds,
            input_ids=prepare_inputs.input_ids,
            images=prepare_inputs.images,
            images_seq_mask=prepare_inputs.images_seq_mask,
            images_spatial_crop=prepare_inputs.images_spatial_crop,
            attention_mask=prepare_inputs.attention_mask,
            past_key_values=past_key_values,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            # do_sample=False,
            # repetition_penalty=1.1,
            do_sample=True,
            temperature=0.4,
            top_p=0.9,
            repetition_penalty=1.1,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0][len(prepare_inputs.input_ids[0]):].cpu().tolist(), skip_special_tokens=False)
    print(f"{prepare_inputs['sft_format'][0]}", answer)

    vg_image = parse_ref_bbox(answer, image=pil_images[-1])
    if vg_image is not None:
        vg_image.save("./vg.jpg", format="JPEG", quality=85)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--model_path", type=str, required=True,
                        default="deepseek-ai/deepseek-vl2",
                        help="model name or local path to the model")
    parser.add_argument("--chunk_size", type=int, default=-1,
                        help="chunk size for incremental prefilling. "
                             "When using a 40G GPU for vl2-small, set a chunk_size to enable incremental_prefilling. "
                             "Otherwise, the default of -1 disables incremental_prefilling.")
    args = parser.parse_args()
    main(args)
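
# Example invocations (the script filename and chunk size below are illustrative,
# not values prescribed by the source):
#
#   python inference.py --model_path deepseek-ai/deepseek-vl2
#   python inference.py --model_path deepseek-ai/deepseek-vl2-small --chunk_size 512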