fix dependency
- app.py +31 -26
- requirements.txt +4 -19
app.py
CHANGED
@@ -1,39 +1,44 @@
 import gradio as gr
-from
+from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
+from deepseek_vl.utils.io import load_pil_images
 import torch

-
-
-
-
-        self.model = AutoModelForVision2Seq.from_pretrained(
-            model_path,
-            torch_dtype=torch.float32
-        ).to(device)
-
-    def generate(self, image, question, max_new_tokens=128):
-        inputs = self.processor(text=question, images=image, return_tensors="pt").to(self.device)
-        with torch.no_grad():
-            output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
-        return self.processor.batch_decode(output_ids, skip_special_tokens=True)[0]
-
-# Initialize DeepSeek-VL model (CPU for free Spaces)
-model = DeepSeekVL(model_path="deepseek-ai/deepseek-vl-7b", device="cpu")
+model_path = "deepseek-ai/deepseek-vl-7b-chat"
+vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
+tokenizer = vl_chat_processor.tokenizer
+vl_gpt = MultiModalityCausalLM.from_pretrained(model_path, trust_remote_code=True).to("cpu")

 def qa(image, question):
-
-
+    conversation = [
+        {"role": "User", "content": "<image_placeholder>" + question, "images": [image]},
+        {"role": "Assistant", "content": ""}
+    ]
+    pil_images = load_pil_images(conversation)
+    prepare_inputs = vl_chat_processor(
+        conversations=conversation,
+        images=pil_images,
+        force_batchify=True
+    ).to("cpu")
+    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+    outputs = vl_gpt.language_model.generate(
+        inputs_embeds=inputs_embeds,
+        attention_mask=prepare_inputs.attention_mask,
+        pad_token_id=tokenizer.eos_token_id,
+        bos_token_id=tokenizer.bos_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        max_new_tokens=512,
+        do_sample=False,
+        use_cache=True
+    )
+    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
+    return answer

 demo = gr.Interface(
     fn=qa,
-    inputs=[
-        gr.Image(type="pil", label="Upload Image"),
-        gr.Textbox(label="Enter your question")
-    ],
+    inputs=[gr.Image(type="pil", label="Upload Image"), gr.Textbox(label="Enter your question")],
     outputs="text",
     title="DeepSeek-VL Multimodal QA Demo",
     description="Upload an image and enter a question. Experience DeepSeek-VL's vision-language capabilities."
 )

-
-demo.launch()
+demo.launch()
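A note on the new qa() handler: gr.Image(type="pil") hands it an in-memory PIL image, while load_pil_images in the upstream DeepSeek-VL repo opens file paths listed under the conversation's "images" key. If that holds for the pinned version, a minimal sketch of a variant that skips load_pil_images and passes the PIL image straight to the processor is shown below; it reuses the module-level vl_chat_processor, vl_gpt, and tokenizer from the diff above, and qa_pil_only is a hypothetical name, not part of this commit.

# Hypothetical variant of qa(), not part of this commit: hand Gradio's
# in-memory PIL image directly to the processor instead of routing it
# through load_pil_images (which expects file paths).
def qa_pil_only(image, question):
    conversation = [
        {"role": "User", "content": "<image_placeholder>" + question},
        {"role": "Assistant", "content": ""}
    ]
    prepare_inputs = vl_chat_processor(
        conversations=conversation,
        images=[image.convert("RGB")],  # gr.Image(type="pil") yields a PIL.Image
        force_batchify=True
    ).to("cpu")
    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True
    )
    return tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)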
requirements.txt
CHANGED
@@ -1,19 +1,4 @@
-
-
-
-
-sentencepiece
-attrdict
-einops
-
-# for gradio demo
-gradio==3.48.0
-gradio-client==0.6.1
-mdtex2html==1.3.0
-pypinyin==0.50.0
-tiktoken==0.5.2
-tqdm==4.64.0
-colorama==0.4.5
-Pygments==2.12.0
-markdown==3.4.1
-SentencePiece==0.1.96
+gradio
+torch
+transformers
+git+https://github.com/deepseek-ai/DeepSeek-VL.git
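Since the git+ line is what provides the deepseek_vl package imported by app.py, a quick post-install sanity check could look like the hypothetical snippet below (run after pip install -r requirements.txt; not part of the Space).

# Hypothetical sanity check, not part of this commit: confirm the four
# requirements above resolve to importable packages.
import importlib

for name in ("gradio", "torch", "transformers", "deepseek_vl"):
    module = importlib.import_module(name)
    print(name, getattr(module, "__version__", "(no __version__ attribute)"))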