KaiChen1998 committed on
Commit
9350a8c
·
1 Parent(s): e1b6b95

convert to HF backend

Browse files
Files changed (1) hide show
  1. app.py +8 -10
app.py CHANGED
@@ -13,7 +13,9 @@ auth_token = os.environ.get("TOKEN_FROM_SECRET")
13
  ##########################################
14
  # LLM part
15
  ##########################################
16
- from transformers import AutoProcessor, AutoTokenizer, TextIteratorStreamer
 
 
17
  from vllm import LLM, SamplingParams
18
  from qwen_vl_utils import process_vision_info
19
  from threading import Thread
@@ -32,14 +34,11 @@ LLM_MODEL_PATH = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
32
  processor = AutoProcessor.from_pretrained(MLLM_MODEL_PATH)
33
  tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_PATH)
34
 
35
- mllm = LLM(model=MLLM_MODEL_PATH, gpu_memory_utilization=0.8,
36
- device='cuda:0', dtype="bfloat16", limit_mm_per_prompt={"image": 1})
37
 
38
- llm = LLM(model=LLM_MODEL_PATH, tensor_parallel_size=1, gpu_memory_utilization=0.8,
39
- device='cuda:0', dtype="bfloat16")
40
-
41
- mllm_sampling = SamplingParams(temperature=0, max_tokens=8192)
42
- llm_sampling = SamplingParams(temperature=0.6, top_p=0.95, max_tokens=8192)
43
 
44
  # === Build Prompts ===
45
  def build_messages(image_path, question):
@@ -171,8 +170,7 @@ title_markdown = ("""
171
  <h1 style="margin: 0;">RACRO: Perceptual Decoupling for Scalable Multi-modal Reasoning via Reward-Optimized Captioning</h1>
172
  <h2 style="margin: 10px 0;">📃 <a href="https://www.arxiv.org/abs/2506.04559" style="font-weight: 400;">Paper</a> | 💻 <a href="https://github.com/gyhdog99/RACRO2" style="font-weight: 400;">Code</a> | 🤗 <a href="https://huggingface.co/collections/KaiChen1998/racro-6848ec8c65b3a0bf33d0fbdb" style="font-weight: 400;">HuggingFace</a></h2>
173
  <p style="margin: 20px 0;">
174
- <strong>1. RACRO is designed for multi-modal reasoning, and thus, image inputs are <mark>ALWAYS</mark> necessary!</strong><br/>
175
- <strong>2. Models are deployed with vLLM, which unfortunately, still does not support streaming outputs for MLLMs.</strong>
176
  </p>
177
  </div>
178
  </div>
 
13
  ##########################################
14
  # LLM part
15
  ##########################################
16
+ import torch
17
+ from transformers import AutoProcessor, AutoTokenizer
18
+ from transformers import Qwen2ForCausalLM, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
19
  from vllm import LLM, SamplingParams
20
  from qwen_vl_utils import process_vision_info
21
  from threading import Thread
 
34
  processor = AutoProcessor.from_pretrained(MLLM_MODEL_PATH)
35
  tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_PATH)
36
 
37
+ mllm = Qwen2_5_VLForConditionalGeneration.from_pretrained(MLLM_MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto")
38
+ llm = Qwen2ForCausalLM.from_pretrained(LLM_MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto")
39
 
40
+ mllm_sampling = dict(temperature=0, max_tokens=8192)
41
+ llm_sampling = dict(temperature=0.6, top_p=0.95, max_tokens=8192)
 
 
 
42
 
43
  # === Build Prompts ===
44
  def build_messages(image_path, question):
 
170
  <h1 style="margin: 0;">RACRO: Perceptual Decoupling for Scalable Multi-modal Reasoning via Reward-Optimized Captioning</h1>
171
  <h2 style="margin: 10px 0;">📃 <a href="https://www.arxiv.org/abs/2506.04559" style="font-weight: 400;">Paper</a> | 💻 <a href="https://github.com/gyhdog99/RACRO2" style="font-weight: 400;">Code</a> | 🤗 <a href="https://huggingface.co/collections/KaiChen1998/racro-6848ec8c65b3a0bf33d0fbdb" style="font-weight: 400;">HuggingFace</a></h2>
172
  <p style="margin: 20px 0;">
173
+ <strong>1. RACRO is designed for multi-modal reasoning, and thus, image inputs are <mark>ALWAYS</mark> necessary!</strong>
 
174
  </p>
175
  </div>
176
  </div>