"""
File: llm.py
Description: Large language model utility functions.
Author: Didier Guillevic
Date: 2025-05-03
"""

from collections.abc import Iterator

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextIteratorStreamer
import threading
import torch
import spaces

import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

#
# Load the model: "Qwen/Qwen3-4B"
#
model_id = "Qwen/Qwen3-4B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto"
)
model = torch.compile(model)
model.eval() # inference mode
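# Note: torch.compile is lazy; compilation happens on the first forward pass,
# so the first call to model.generate() will be noticeably slower than later ones.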

# Get end of thinking response token (used to split the response)
end_think_token_id = tokenizer.convert_tokens_to_ids("</think>")

# Output information about the model
def model_info(model):
    # Number of parameters
    total_params = sum(p.numel() for p in model.parameters())

    # Estimated memory usage of the parameters (in bytes; converted to GB below)
    param_size = total_params * model.dtype.itemsize

    return {
        "dtype": model.dtype,
        "device": model.device,
        "nb_parameters": f"{total_params / 1e6:.2f} M",
        "size": f"{param_size / 1024**3:.2f} GB"
    }

logger.info(f"{model_info(model)}")
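# Illustrative only: for Qwen3-4B in bfloat16 the log above typically reports
# roughly 4,000 M parameters and on the order of 7.5 GB of weights (exact
# figures depend on the checkpoint and the dtype chosen by torch_dtype="auto").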

#
# Build (text) messages
#
def build_messages(
        message: str,
        history: list[dict]
    ) -> list[dict]:
    """Build messages given message & history from a **text** chat interface.

    Args:
        message: user input
        history: list of dictionaries (with user & assistant messages)
    
    Returns:
        list of messages (to be sent to the model)
    """
    # Copy the history so the caller's list is not mutated in place
    messages = list(history)
    # A ' /think' or ' /no_think' suffix could be appended to the message to
    # toggle thinking (thinking is enabled by default for Qwen3), e.g.:
    #   'content': message + (' /think' if thinking else ' /no_think')
    messages.append({
        'role': 'user',
        'content': message
    })

    return messages
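
# Example (illustrative): with history = [
#     {'role': 'user', 'content': 'Hi'},
#     {'role': 'assistant', 'content': 'Hello! How can I help?'},
# ] and message = 'What is 2 + 2?', build_messages returns those two turns
# followed by a new {'role': 'user', 'content': 'What is 2 + 2?'} entry.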


#
# Stream response
#
@spaces.GPU
@torch.inference_mode()
def stream_response(
        messages: list[dict],
        enable_thinking: bool=False,
        max_new_tokens: int=1_024
    ) -> Iterator[tuple[str, str]]:
    """Stream the model's response to the chat interface.

    Args:
        messages: list of messages (to be sent to the model)
        enable_thinking: whether the model should think before responding
        max_new_tokens: maximum number of tokens to generate

    Yields:
        (thinking_response, final_response) pairs, growing as tokens stream in
    """
    # apply chat template and get model's inputs
    model_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking
    )
    model_inputs = tokenizer(
        [model_prompt,],
        return_tensors="pt"
    ).to(model.device)

    # get the model's response
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        top_k=20,
        repetition_penalty=1.5,
        min_p=0.0,
        use_cache=True,
    )
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    thinking_response = ""
    final_response = ""
    is_final_response = False

    for text in streamer:
        final_response += text
        yield final_response
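

# ---------------------------------------------------------------------------
# Minimal local smoke test: a sketch only, not part of the Space app itself.
# It assumes the module is run directly on a machine with enough memory to
# load Qwen3-4B and that the `spaces` decorator degrades gracefully outside a
# Hugging Face Space; the prompt and token budget are arbitrary illustrations.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    demo_messages = build_messages("What is the capital of France?", history=[])
    thinking, answer = "", ""
    for thinking, answer in stream_response(
            demo_messages, enable_thinking=True, max_new_tokens=256):
        pass  # consume the stream; keep the last (complete) pair
    print("THINKING:", thinking)
    print("ANSWER:", answer)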