Yixin1234 committed
Commit 9618b1a · 1 Parent(s): 3f81199

fix dependency

Files changed (2)
  1. app.py +31 -26
  2. requirements.txt +4 -19
app.py CHANGED
@@ -1,39 +1,44 @@
  import gradio as gr
- from transformers import AutoProcessor, AutoModelForVision2Seq
+ from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
+ from deepseek_vl.utils.io import load_pil_images
  import torch

- class DeepSeekVL:
-     def __init__(self, model_path="deepseek-ai/deepseek-vl-7b", device="cpu"):
-         self.device = device
-         self.processor = AutoProcessor.from_pretrained(model_path)
-         self.model = AutoModelForVision2Seq.from_pretrained(
-             model_path,
-             torch_dtype=torch.float32
-         ).to(device)
-
-     def generate(self, image, question, max_new_tokens=128):
-         inputs = self.processor(text=question, images=image, return_tensors="pt").to(self.device)
-         with torch.no_grad():
-             output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
-         return self.processor.batch_decode(output_ids, skip_special_tokens=True)[0]
-
- # Initialize DeepSeek-VL model (CPU for free Spaces)
- model = DeepSeekVL(model_path="deepseek-ai/deepseek-vl-7b", device="cpu")
+ model_path = "deepseek-ai/deepseek-vl-7b-chat"
+ vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
+ tokenizer = vl_chat_processor.tokenizer
+ vl_gpt = MultiModalityCausalLM.from_pretrained(model_path, trust_remote_code=True).to("cpu")

  def qa(image, question):
-     # Run DeepSeek-VL inference: image + question -> answer
-     return model.generate(image, question)
+     conversation = [
+         {"role": "User", "content": "<image_placeholder>" + question, "images": [image]},
+         {"role": "Assistant", "content": ""}
+     ]
+     pil_images = load_pil_images(conversation)
+     prepare_inputs = vl_chat_processor(
+         conversations=conversation,
+         images=pil_images,
+         force_batchify=True
+     ).to("cpu")
+     inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+     outputs = vl_gpt.language_model.generate(
+         inputs_embeds=inputs_embeds,
+         attention_mask=prepare_inputs.attention_mask,
+         pad_token_id=tokenizer.eos_token_id,
+         bos_token_id=tokenizer.bos_token_id,
+         eos_token_id=tokenizer.eos_token_id,
+         max_new_tokens=512,
+         do_sample=False,
+         use_cache=True
+     )
+     answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
+     return answer

  demo = gr.Interface(
      fn=qa,
-     inputs=[
-         gr.Image(type="pil", label="Upload Image"),
-         gr.Textbox(label="Enter your question")
-     ],
+     inputs=[gr.Image(type="pil", label="Upload Image"), gr.Textbox(label="Enter your question")],
      outputs="text",
      title="DeepSeek-VL Multimodal QA Demo",
      description="Upload an image and enter a question. Experience DeepSeek-VL's vision-language capabilities."
  )

- if __name__ == "__main__":
-     demo.launch()
+ demo.launch()
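For a quick sanity check outside the Gradio UI, the new qa() can be exercised directly in the same Python session once the model above has loaded. A minimal sketch, where example.jpg is a placeholder path; since load_pil_images in the upstream DeepSeek-VL repo opens each entry of the conversation's "images" list with PIL.Image.open, passing a file path is a safe input here:

# Hypothetical smoke test, assuming the code in app.py has already run and loaded the model.
# "example.jpg" is a placeholder; any local image path should work.
answer = qa("example.jpg", "What objects are visible in this image?")
print(answer)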
 
requirements.txt CHANGED
@@ -1,19 +1,4 @@
- torch==2.0.1
- transformers>=4.38.2
- timm>=0.9.16
- accelerate
- sentencepiece
- attrdict
- einops
-
- # for gradio demo
- gradio==3.48.0
- gradio-client==0.6.1
- mdtex2html==1.3.0
- pypinyin==0.50.0
- tiktoken==0.5.2
- tqdm==4.64.0
- colorama==0.4.5
- Pygments==2.12.0
- markdown==3.4.1
- SentencePiece==0.1.96
+ gradio
+ torch
+ transformers
+ git+https://github.com/deepseek-ai/DeepSeek-VL.git
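The last requirement installs the DeepSeek-VL package straight from GitHub; that package is what provides the deepseek_vl imports used in the new app.py. A minimal post-install check, assuming the requirements have been installed into the Space's environment:

# Verify that the GitHub-installed deepseek_vl package exposes the symbols app.py imports.
from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
from deepseek_vl.utils.io import load_pil_images

print(VLChatProcessor.__name__, MultiModalityCausalLM.__name__, load_pil_images.__name__)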