""" | |
File: llm.py | |
Description: Large language model utility functions. | |
Author: Didier Guillevic | |
Date: 2025-05-03 | |
""" | |
from collections.abc import Iterator
import threading
import logging

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextIteratorStreamer
import spaces

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#
# Load the model: "Qwen/Qwen3-4B"
#
model_id = "Qwen/Qwen3-4B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",   # use the dtype stored in the checkpoint
    device_map="auto"
)
model = torch.compile(model)
model.eval()  # inference mode
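# Note: torch.compile is lazy; compilation happens on the first forward pass,
# so the first generation request will be noticeably slower than later ones.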
# End-of-thinking marker, used to split the thinking trace from the final answer
end_think_token_id = tokenizer.convert_tokens_to_ids("</think>")
end_think_token = tokenizer.decode([end_think_token_id])  # the literal "</think>"
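# For reference, in the non-streaming case the Qwen3 model card splits the
# output by locating the last occurrence of this token id in the generated ids:
#
#   index = len(output_ids) - output_ids[::-1].index(end_think_token_id)
#   thinking = tokenizer.decode(output_ids[:index], skip_special_tokens=True)
#   answer = tokenizer.decode(output_ids[index:], skip_special_tokens=True)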
# Output information about the model
def model_info(model):
    # Number of parameters
    total_params = sum(p.numel() for p in model.parameters())
    # Estimated memory usage in bytes (parameters only; this excludes
    # activations and the KV cache used at generation time)
    param_size = total_params * model.dtype.itemsize
    return {
        "dtype": model.dtype,
        "device": model.device,
        "nb_parameters": f"{total_params / 1e6:.2f} M",
        "size": f"{param_size / 1024**3:.2f} GB"
    }

logger.info(f"{model_info(model)}")
#
# Build (text) messages
#
def build_messages(
    message: str,
    history: list[dict]
) -> list[dict]:
    """Build messages given message & history from a **text** chat interface.

    Args:
        message: user input
        history: list of dictionaries (with user & assistant messages)

    Returns:
        list of messages (to be sent to the model)
    """
    messages = list(history)  # copy, so we do not mutate the caller's history
    # Thinking can also be toggled per message with a soft switch
    # (thinking is enabled by default, so ' /think' could be omitted):
    # 'content': message + (' /think' if thinking else ' /no_think')
    messages.append({
        'role': 'user',
        'content': message
    })
    return messages
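# Illustrative example (hypothetical history, in the Gradio "messages" format):
#
#   history = [{"role": "user", "content": "Hi"},
#              {"role": "assistant", "content": "Hello! How can I help?"}]
#   build_messages("What does a tokenizer do?", history)
#   # -> history + [{"role": "user", "content": "What does a tokenizer do?"}]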
#
# Stream response
#
@spaces.GPU  # request a GPU for this call when running on ZeroGPU
def stream_response(
    messages: list[dict],
    enable_thinking: bool=False,
    max_new_tokens: int=1_024
) -> Iterator[tuple[str, str]]:
    """Stream the model's response to the chat interface.

    Args:
        messages: list of messages (to be sent to the model)
        enable_thinking: whether the model should think before responding
        max_new_tokens: maximum number of tokens to generate

    Yields:
        tuple of (thinking_response, final_response), growing as tokens arrive
    """
    # apply chat template and get the model's inputs
    model_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking
    )
    model_inputs = tokenizer([model_prompt], return_tensors="pt").to(model.device)
    # generate in a background thread, streaming tokens as they are produced
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        top_k=20,
        repetition_penalty=1.5,
        min_p=0.0,
        use_cache=True,
    )
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    # Split the stream at Qwen3's "</think>" marker (a regular, non-special
    # token, so it is not removed by skip_special_tokens=True)
    thinking_response, final_response = "", ""
    is_final_response = not enable_thinking  # no trace expected when disabled
    for text in streamer:
        if is_final_response:
            final_response += text
        else:
            thinking_response += text
            if end_think_token in thinking_response:
                head, _, final_response = thinking_response.partition(end_think_token)
                thinking_response = head.removeprefix("<think>").strip()
                is_final_response = True
        yield thinking_response, final_response
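# Minimal local smoke test (a sketch, not used by the Space itself). Assumes the
# model fits on the local device; the spaces decorator is designed to be a
# no-op outside a ZeroGPU Space.
if __name__ == "__main__":
    demo_messages = build_messages("In one sentence, what is a tokenizer?", [])
    thinking, answer = "", ""
    for thinking, answer in stream_response(demo_messages, enable_thinking=True):
        pass  # drain the stream; keep the last (complete) pair
    print(f"thinking: {thinking}\nanswer: {answer}")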