""" | |
File: llm.py | |
Description: Large language model utility functions. | |
Author: Didier Guillevic | |
Date: 2025-05-03 | |
""" | |
from collections.abc import Iterator
import threading
import logging

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextIteratorStreamer
import spaces

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#
# Load the model: "Qwen/Qwen3-4B"
#
model_id = "Qwen/Qwen3-4B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",   # use the dtype stored in the checkpoint
    device_map="auto"
)
model = torch.compile(model)
model.eval()  # inference mode
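# Note: torch.compile is lazy; compilation happens on the first forward pass,
# so the first generation request will be noticeably slower than later ones.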
# End-of-thinking marker, used to split the thinking trace from the final answer
end_think_token_id = tokenizer.convert_tokens_to_ids("</think>")
end_think_token = tokenizer.decode([end_think_token_id])  # the literal "</think>"
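# For reference, in the non-streaming case the Qwen3 model card splits the
# output by locating the last occurrence of this token id in the generated ids:
#
#   index = len(output_ids) - output_ids[::-1].index(end_think_token_id)
#   thinking = tokenizer.decode(output_ids[:index], skip_special_tokens=True)
#   answer = tokenizer.decode(output_ids[index:], skip_special_tokens=True)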
# Output information about the model
def model_info(model):
    # Number of parameters
    total_params = sum(p.numel() for p in model.parameters())
    # Estimated memory usage in bytes (parameters only; this excludes
    # activations and the KV cache used at generation time)
    param_size = total_params * model.dtype.itemsize
    return {
        "dtype": model.dtype,
        "device": model.device,
        "nb_parameters": f"{total_params / 1e6:.2f} M",
        "size": f"{param_size / 1024**3:.2f} GB"
    }

logger.info(f"{model_info(model)}")
#
# Build (text) messages
#
def build_messages(
    message: str,
    history: list[dict]
) -> list[dict]:
    """Build messages given message & history from a **text** chat interface.

    Args:
        message: user input
        history: list of dictionaries (with user & assistant messages)

    Returns:
        list of messages (to be sent to the model)
    """
    messages = list(history)  # copy, so we do not mutate the caller's history
    # Thinking can also be toggled per message with a soft switch
    # (thinking is enabled by default, so ' /think' could be omitted):
    # 'content': message + (' /think' if thinking else ' /no_think')
    messages.append({
        'role': 'user',
        'content': message
    })
    return messages
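# Illustrative example (hypothetical history, in the Gradio "messages" format):
#
#   history = [{"role": "user", "content": "Hi"},
#              {"role": "assistant", "content": "Hello! How can I help?"}]
#   build_messages("What does a tokenizer do?", history)
#   # -> history + [{"role": "user", "content": "What does a tokenizer do?"}]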
#
# Stream response
#
@spaces.GPU  # request a GPU for this call when running on ZeroGPU
def stream_response(
    messages: list[dict],
    enable_thinking: bool=False,
    max_new_tokens: int=1_024
) -> Iterator[tuple[str, str]]:
    """Stream the model's response to the chat interface.

    Args:
        messages: list of messages (to be sent to the model)
        enable_thinking: whether the model should think before responding
        max_new_tokens: maximum number of tokens to generate

    Yields:
        tuple of (thinking_response, final_response), growing as tokens arrive
    """
    # apply chat template and get the model's inputs
    model_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking
    )
    model_inputs = tokenizer([model_prompt], return_tensors="pt").to(model.device)
    # generate in a background thread, streaming tokens as they are produced
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        top_k=20,
        repetition_penalty=1.5,
        min_p=0.0,
        use_cache=True,
    )
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    # Split the stream at Qwen3's "</think>" marker (a regular, non-special
    # token, so it is not removed by skip_special_tokens=True)
    thinking_response, final_response = "", ""
    is_final_response = not enable_thinking  # no trace expected when disabled
    for text in streamer:
        if is_final_response:
            final_response += text
        else:
            thinking_response += text
            if end_think_token in thinking_response:
                head, _, final_response = thinking_response.partition(end_think_token)
                thinking_response = head.removeprefix("<think>").strip()
                is_final_response = True
        yield thinking_response, final_response
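# Minimal local smoke test (a sketch, not used by the Space itself). Assumes the
# model fits on the local device; the spaces decorator is designed to be a
# no-op outside a ZeroGPU Space.
if __name__ == "__main__":
    demo_messages = build_messages("In one sentence, what is a tokenizer?", [])
    thinking, answer = "", ""
    for thinking, answer in stream_response(demo_messages, enable_thinking=True):
        pass  # drain the stream; keep the last (complete) pair
    print(f"thinking: {thinking}\nanswer: {answer}")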