import inspect
from typing import get_type_hints, Callable, Any

import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM

# --- Load Model and Tokenizer ---
model_id = "unsloth/SmolLM2-135M-Instruct-GGUF"
filename = "SmolLM2-135M-Instruct-Q8_0.gguf"

tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename)
model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename)
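# Note: loading a GGUF file through transformers requires the `gguf` package
# (`pip install gguf`); the quantized weights are dequantized back into regular
# torch tensors at load time, so this trades disk footprint for RAM.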

# --- System Prompt Template ---
SYSTEM_PROMPT = """You are a helpful AI assistant. Your job is to provide clear and concise responses based on the user's input.
Keep your answers straightforward and avoid unnecessary information."""

def parse_docstring(func):
    doc = inspect.getdoc(func)
    if not doc:
        return {"title": "Untitled", "description": ""}
    lines = doc.splitlines()
    title = next((line.replace("Title:", "").strip() for line in lines if line.startswith("Title:")), "Untitled")
    description = "\n".join(line.strip() for line in lines if line.startswith("Description:"))
    description = description.replace("Description:", "").strip()
    return {"title": title, "description": description}

def gradio_app_with_docs(func: Callable) -> Callable:
    """
    A decorator that automatically builds and launches a Gradio interface
    based on function type hints.

    Args:
        func: A callable with type-hinted parameters and return type.

    Returns:
        The wrapped function with a `.launch()` method to start the app.
    """
    sig = inspect.signature(func)
    type_hints = get_type_hints(func)
    metadata = parse_docstring(func)
    def _map_type(t: type) -> gr.Component:
        if t == str:
            return gr.Textbox(label="Input")
        elif t == int:
            return gr.Number(precision=0)
        elif t == float:
            return gr.Number()
        elif t == bool:
            return gr.Checkbox()
        elif hasattr(t, "__origin__") and t.__origin__ == list:
            elem_type = t.__args__[0]
            if elem_type == str:
                return gr.Dropdown(choices=["Option1", "Option2"])
            else:
                raise ValueError(f"Unsupported list element type: {elem_type}")
        else:
            raise ValueError(f"Unsupported type: {t}")

    # Build inputs
    inputs = []
    for name, param in sig.parameters.items():
        if name == "self":
            continue
        param_type = type_hints.get(name, Any)
        component = _map_type(param_type)
        component.label = name.replace("_", " ").title()
        inputs.append(component)

    # Build outputs
    return_type = type_hints.get("return", Any)
    outputs = _map_type(return_type)

    # Wrap the function with a Gradio interface, embedded in a Blocks layout;
    # .render() attaches the Interface to the enclosing Blocks context
    with gr.Blocks() as demo:
        gr.Markdown(f"## {metadata['title']}\n{metadata['description']}")
        gr.Interface(fn=func, inputs=inputs, outputs=outputs).render()

    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)

    # Expose the Blocks app through a `.launch()` attribute on the wrapper
    wrapper.launch = lambda: demo.launch()
    return wrapper
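
# Usage sketch (hypothetical function, for illustration only):
#
#   @gradio_app_with_docs
#   def echo(message: str) -> str:
#       """
#       Title: Echo
#       Description: Returns the message unchanged.
#       """
#       return message
#
#   echo.launch()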

@gradio_app_with_docs
def generate_response(prompt: str) -> str:
    """
    Title: Super Tiny GGUF Model on CPU
    Description: A simple app to test out the potential of a small GGUF LLM.

    Args:
        prompt (str): A simple prompt.

    Returns:
        str: Simplified response.
    """
    # Apply system prompt + user input via the model's chat template
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True  # append the assistant turn header so the model completes it
    )
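    # For SmolLM2 the rendered prompt is ChatML-style, roughly:
    #   <|im_start|>system\n...<|im_end|>\n<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n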
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        # To sample instead of greedy decoding, also set do_sample=True:
        # do_sample=True, temperature=0.7, top_p=0.9
    )
    # Decode only the newly generated tokens; outputs[0] also contains the prompt
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

if __name__ == "__main__":
    generate_response.launch()