Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
9350a8c
1
Parent(s):
e1b6b95
convert to HF backend
Browse files
app.py
CHANGED
@@ -13,7 +13,9 @@ auth_token = os.environ.get("TOKEN_FROM_SECRET")
|
|
13 |
##########################################
|
14 |
# LLM part
|
15 |
##########################################
|
16 |
-
|
|
|
|
|
17 |
from vllm import LLM, SamplingParams
|
18 |
from qwen_vl_utils import process_vision_info
|
19 |
from threading import Thread
|
@@ -32,14 +34,11 @@ LLM_MODEL_PATH = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
|
|
32 |
processor = AutoProcessor.from_pretrained(MLLM_MODEL_PATH)
|
33 |
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_PATH)
|
34 |
|
35 |
-
mllm =
|
36 |
-
|
37 |
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
mllm_sampling = SamplingParams(temperature=0, max_tokens=8192)
|
42 |
-
llm_sampling = SamplingParams(temperature=0.6, top_p=0.95, max_tokens=8192)
|
43 |
|
44 |
# === Build Prompts ===
|
45 |
def build_messages(image_path, question):
|
@@ -171,8 +170,7 @@ title_markdown = ("""
|
|
171 |
<h1 style="margin: 0;">RACRO: Perceptual Decoupling for Scalable Multi-modal Reasoning via Reward-Optimized Captioning</h1>
|
172 |
<h2 style="margin: 10px 0;">📄 <a href="https://www.arxiv.org/abs/2506.04559" style="font-weight: 400;">Paper</a> | 💻 <a href="https://github.com/gyhdog99/RACRO2" style="font-weight: 400;">Code</a> | 🤗 <a href="https://huggingface.co/collections/KaiChen1998/racro-6848ec8c65b3a0bf33d0fbdb" style="font-weight: 400;">HuggingFace</a></h2>
|
173 |
<p style="margin: 20px 0;">
|
174 |
-
<strong>1. RACRO is designed for multi-modal reasoning, and thus, image inputs are <mark>ALWAYS</mark> necessary!</strong
|
175 |
-
<strong>2. Models are deployed with vLLM, which unfortunately, still does not support streaming outputs for MLLMs.</strong>
|
176 |
</p>
|
177 |
</div>
|
178 |
</div>
|
|
|
13 |
##########################################
|
14 |
# LLM part
|
15 |
##########################################
|
16 |
+
import torch
|
17 |
+
from transformers import AutoProcessor, AutoTokenizer
|
18 |
+
from transformers import Qwen2ForCausalLM, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
|
19 |
from vllm import LLM, SamplingParams
|
20 |
from qwen_vl_utils import process_vision_info
|
21 |
from threading import Thread
|
|
|
34 |
processor = AutoProcessor.from_pretrained(MLLM_MODEL_PATH)
|
35 |
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_PATH)
|
36 |
|
37 |
+
mllm = Qwen2_5_VLForConditionalGeneration.from_pretrained(MLLM_MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto")
|
38 |
+
llm = Qwen2ForCausalLM.from_pretrained(LLM_MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto")
|
39 |
|
40 |
+
mllm_sampling = dict(temperature=0, max_tokens=8192)
|
41 |
+
llm_sampling = dict(temperature=0.6, top_p=0.95, max_tokens=8192)
|
|
|
|
|
|
|
42 |
|
43 |
# === Build Prompts ===
|
44 |
def build_messages(image_path, question):
|
|
|
170 |
<h1 style="margin: 0;">RACRO: Perceptual Decoupling for Scalable Multi-modal Reasoning via Reward-Optimized Captioning</h1>
|
171 |
<h2 style="margin: 10px 0;">📄 <a href="https://www.arxiv.org/abs/2506.04559" style="font-weight: 400;">Paper</a> | 💻 <a href="https://github.com/gyhdog99/RACRO2" style="font-weight: 400;">Code</a> | 🤗 <a href="https://huggingface.co/collections/KaiChen1998/racro-6848ec8c65b3a0bf33d0fbdb" style="font-weight: 400;">HuggingFace</a></h2>
|
172 |
<p style="margin: 20px 0;">
|
173 |
+
<strong>1. RACRO is designed for multi-modal reasoning, and thus, image inputs are <mark>ALWAYS</mark> necessary!</strong>
|
|
|
174 |
</p>
|
175 |
</div>
|
176 |
</div>
|