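# Minimal Gradio text demo for ByteDance-Seed/UI-TARS-1.5-7B, loaded with 4-bit quantization.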
import gradio as gr
import torch
from transformers import AutoModelForVision2Seq, AutoTokenizer, BitsAndBytesConfig

model_name = "ByteDance-Seed/UI-TARS-1.5-7B"

# Load the model in 4-bit NF4 so it fits in limited GPU memory.
model = AutoModelForVision2Seq.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    ),
    low_cpu_mem_usage=True,
)

# The tokenizer is enough for text-only prompts; image inputs would go through AutoProcessor instead.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def greet(prompt):
    # Build a single-turn conversation and format it with the model's chat template.
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=True,  # Extra template kwarg; templates that don't define it simply ignore it.
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(**model_inputs, max_new_tokens=32768)
    # Keep only the newly generated tokens, dropping the echoed prompt.
    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

    # Try to split off a thinking segment: 151668 is the </think> token id from Qwen's
    # thinking-mode recipe. If the model never emits it, treat the whole output as content.
    try:
        index = len(output_ids) - output_ids[::-1].index(151668)
    except ValueError:
        index = 0

    thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
    content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

    return "thinking content: " + thinking_content + "\n" + "content: " + content


demo = gr.Interface(fn=greet, inputs="text", outputs="text")
demo.launch()