"""
File: llm.py
Description: Large language model utility functions.
Author: Didier Guillevic
Date: 2025-05-03
"""
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextIteratorStreamer
import threading
from collections.abc import Iterator
import torch
import spaces
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
#
# Load the model: "Qwen/Qwen3-4B"
#
model_id = "Qwen/Qwen3-4B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto"
)
model = torch.compile(model)
model.eval() # inference mode
# Get end of thinking response token (used to split the response)
end_think_token_id = tokenizer.convert_tokens_to_ids("</think>")
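
# A sketch (not used by the streaming path below): splitting a *completed*
# generation into thinking/answer parts with end_think_token_id, following
# the post-hoc parsing shown on the Qwen3 model card.
def split_thinking(output_ids: list[int]) -> tuple[str, str]:
    """Split generated token ids into (thinking, answer) at </think>."""
    try:
        # index just past the last </think> token
        index = len(output_ids) - output_ids[::-1].index(end_think_token_id)
    except ValueError:
        index = 0  # no </think> found: treat everything as the answer
    thinking = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
    answer = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
    return thinking, answer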
# Output information about the model
def model_info(model) -> dict:
    """Return basic information about the model (dtype, device, size)."""
    # Number of parameters
    total_params = sum(p.numel() for p in model.parameters())
    # Estimated memory usage: parameters x bytes per element
    param_size = total_params * model.dtype.itemsize  # in bytes
    return {
        "dtype": model.dtype,
        "device": model.device,
        "nb_parameters": f"{total_params / 1e6:.2f} M",
        "size": f"{param_size / 1024**3:.2f} GB"
    }
logger.info(model_info(model))
#
# Build (text) messages
#
def build_messages(
    message: str,
    history: list[dict]
) -> list[dict]:
    """Build messages given message & history from a **text** chat interface.

    Args:
        message: user input
        history: list of dictionaries (with user & assistant messages)

    Returns:
        list of messages (to be sent to the model)
    """
    # Copy the history so we do not mutate the caller's list in place
    messages = list(history)

    # Whether the model thinks before responding is controlled via
    # `enable_thinking` in the chat template (see stream_response below).
    # Alternatively, Qwen3 supports a per-message soft switch:
    #   'content': message + (' /think' if thinking else ' /no_think')
    messages.append({
        'role': 'user',
        'content': message
    })
    return messages
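
# Example (hypothetical history, in the "messages" format produced by
# gr.ChatInterface(type="messages")):
#   history = [
#       {'role': 'user', 'content': 'Hello!'},
#       {'role': 'assistant', 'content': 'Hi! How can I help?'},
#   ]
#   build_messages('What time is it?', history)
#   -> the two messages above plus {'role': 'user', 'content': 'What time is it?'}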
#
# Stream response
#
@spaces.GPU
@torch.inference_mode()
def stream_response(
    messages: list[dict],
    enable_thinking: bool = False,
    max_new_tokens: int = 1_024
) -> Iterator[tuple[str, str]]:
    """Stream the model's response to the chat interface.

    Args:
        messages: list of messages (to be sent to the model)
        enable_thinking: whether the model should think before responding
        max_new_tokens: maximum number of new tokens to generate

    Yields:
        tuples of (thinking_response, final_response), each growing as
        text streams in
    """
    # apply chat template and get the model's inputs
    model_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking
    )
    model_inputs = tokenizer(
        [model_prompt],
        return_tensors="pt"
    ).to(model.device)
    # get the model's response (generation runs in a background thread)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        top_k=20,
        repetition_penalty=1.5,
        min_p=0.0,
        use_cache=True,
    )
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Split the stream on "</think>" (the decoded form of end_think_token_id):
    # text before the tag is the thinking trace; text after it is the answer.
    # With enable_thinking=False, the chat template pre-fills an empty think
    # block, so no "</think>" is generated and all text is the final answer.
    thinking_response = ""
    final_response = ""
    is_final_response = not enable_thinking
    for text in streamer:
        if not is_final_response and "</think>" in text:
            before, _, after = text.partition("</think>")
            thinking_response += before.replace("<think>", "")
            final_response += after
            is_final_response = True
        elif is_final_response:
            final_response += text
        else:
            thinking_response += text.replace("<think>", "")
        yield thinking_response, final_response
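
if __name__ == "__main__":
    # Minimal smoke test (a sketch: assumes a machine that can load the model;
    # outside a ZeroGPU Space, the @spaces.GPU decorator is a no-op).
    demo_messages = build_messages("What is the capital of France?", [])
    thinking, answer = "", ""
    for thinking, answer in stream_response(demo_messages, enable_thinking=True):
        pass  # consume the stream; keep the final accumulated strings
    print("thinking:", thinking)
    print("answer:", answer)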