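# app.py: Gradio chat Space for SmolLM3-3B, served with llama-cpp-python and a
# Q4_K_M GGUF checkpoint. The commented-out blocks below are earlier model-loading
# attempts (transformers fp16, torchao float8, unsloth 4-bit, ONNX Runtime
# quantization) kept for reference; only the llama.cpp path at the bottom is live.
# Expected runtime dependencies (assumed here, not pinned in this file):
# llama-cpp-python, gradio, pypdf, pandas + openpyxl, python-docx.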
# import gc
# import gradio as gr
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM #, HqqConfig
# # # quant_config = HqqConfig(nbits=8, group_size=64)
# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# print("Loading tokenizer & model…")
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# # # model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to(DEVICE)
# model = \
#     AutoModelForCausalLM \
#     .from_pretrained(
#         MODEL_ID,
#         torch_dtype=torch.float16,
#         # device_map="cuda",
#         # quantization_config=quant_config
#     ).to(DEVICE)
# gc.collect()
#########
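# --- Earlier attempt: torchao float8 quantization (weight-only or dynamic-activation configs) ---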
# import torch
# from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
# from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig
# # quant_config = Float8WeightOnlyConfig()
# quant_config = Float8DynamicActivationFloat8WeightConfig()
# quantization_config = TorchAoConfig(quant_type=quant_config)
# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_ID,
#     torch_dtype="auto",
#     device_map="auto",
#     quantization_config=quantization_config)
# gc.collect()
#########
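# --- Earlier attempt: unsloth FastLanguageModel with a 4-bit bitsandbytes Llama-3.2-3B-Instruct checkpoint ---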
# from unsloth import FastLanguageModel
# model, tokenizer = FastLanguageModel.from_pretrained(
#     "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
#     max_seq_length=128_000,
#     load_in_4bit=True
# )
#########
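# --- Earlier attempt: ONNX export, then dynamic quantization (AVX512-VNNI config) via optimum's ORTQuantizer ---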
# import gc
# import gradio as gr
# from transformers import AutoTokenizer
# from optimum.onnxruntime import ORTModelForCausalLM, ORTQuantizer
# from optimum.onnxruntime.configuration import AutoQuantizationConfig
# MODEL_NAME = "HuggingFaceTB/SmolLM3-3B"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = ORTModelForCausalLM.from_pretrained(MODEL_NAME, export=True)
# print("Creating quant config")
# qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
# print("Creating quant config successful")
# print("Creating quantizer")
# quantizer = ORTQuantizer.from_pretrained(model)
# print("Creating quantizer successful")
# # Step 4: Perform quantization saving output in a new directory
# quantized_model_dir = "./quantized_model"
# print("Starting quantization...")
# quantizer.quantize(save_dir=quantized_model_dir, quantization_config=qconfig)
# print("Quantization was successful. Garbage collecting...")
# del(quantizer)
# del(qconfig)
# del(model)
# # Run garbage collection again to release memory from quantizer objects
# gc.collect()
# # Step 5: Load the quantized ONNX model for inference
# print("Loading quantized ONNX model for inference...")
# model = ORTModelForCausalLM.from_pretrained(quantized_model_dir)
# print("Loading model was succcessful. Garbage collecting.")
# Garbage collection again after final loading
# gc.collect()
#########
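# --- Earlier attempt: one-step ONNX Runtime export of SmolLM3-3B via ORTModelForCausalLM ---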
# print("Loading tokenizer & model…")
# import gradio as gr
# from transformers import AutoTokenizer
# from optimum.onnxruntime import ORTModelForCausalLM
# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# model = ORTModelForCausalLM.from_pretrained(MODEL_ID, export=True, quantize=True)
#########
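# --- Earlier attempt: the transformers-based version of the chat UI, superseded by the llama.cpp version below ---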
# -------------------------------------------------
# Optional tool(s)
# -------------------------------------------------
# TOOLS = [{
#     "name": "get_weather",
#     "description": "Get the current weather in a given city",
#     "parameters": {
#         "type": "object",
#         "properties": {
#             "city": {"type": "string", "description": "City name"}
#         },
#         "required": ["city"]
#     }
# }]
# -------------------------------------------------
# Helpers
# -------------------------------------------------
# def build_messages(history, enable_thinking: bool):
#     """Convert Gradio history to the chat template."""
#     messages = []
#     for h in history:
#         messages.append({"role": h["role"], "content": h["content"]})
#     # Add system instruction for mode
#     system_flag = "/think" if enable_thinking else "/no_think"
#     messages.insert(0, {"role": "system", "content": system_flag})
#     return messages
# def chat_fn(history, enable_thinking, temperature, top_p, top_k, repetition_penalty, max_new_tokens):
#     """Generate a streaming response."""
#     messages = build_messages(history, enable_thinking)
#     text = tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True,
#         # xml_tools=TOOLS
#     )
#     inputs = tokenizer(text, return_tensors="pt")
#     gc.collect()
#     with torch.inference_mode():
#         streamer = model.generate(
#             **inputs,
#             max_new_tokens=max_new_tokens,
#             do_sample=True,
#             temperature=temperature,
#             top_p=top_p,
#             top_k=top_k,
#             repetition_penalty=repetition_penalty,
#             pad_token_id=tokenizer.eos_token_id,
#             streamer=None  # we'll yield manually
#         )
#     gc.collect()
#     output_ids = streamer[0][len(inputs.input_ids[0]):]
#     response = tokenizer.decode(output_ids, skip_special_tokens=True)
#     if isinstance(response, str):
#         response = response.replace('<think>', "# &lt;think&gt;").replace('</think>', "&lt;/think&gt;")
#     elif isinstance(response, list):
#         response = [paper.replace('<think>', "# &lt;think&gt;").replace('</think>', "&lt;/think&gt;") for paper in response]
#     else:
#         raise ValueError("Tokenizer response seems malformed; not a string, nor a list?!")
#     # streaming char-by-char
#     history.append({"role": "assistant", "content": ""})
#     for ch in response:
#         history[-1]["content"] += ch
#         yield history
# # -------------------------------------------------
# # Blocks UI
# # -------------------------------------------------
# with gr.Blocks(title="SmolLM3-3B Chat") as demo:
#     gr.Markdown("## 🤖 SmolLM3-3B Chatbot (Streaming)")
#     with gr.Row():
#         enable_think = gr.Checkbox(label="Enable Extended Thinking (/think)", value=True)
#         temperature = gr.Slider(0.0, 1.0, value=0.6, label="Temperature")
#         top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
#         top_k = gr.Slider(1, 40, value=20, label="Top_k")
#         repetition_penalty = gr.Slider(1.0, 1.4, value=1.1, label="Repetition_Penalty")
#         max_new_tokens = gr.Slider(1000, 32768, value=32768, label="Max_New_Tokens")
#     chatbot = gr.Chatbot(type="messages")
#     msg = gr.Textbox(placeholder="Type your message here…", lines=1)
#     clear = gr.Button("Clear")
#
#     def user_fn(user_msg, history):
#         return "", history + [{"role": "user", "content": user_msg}]
#
#     msg.submit(
#         user_fn, [msg, chatbot], [msg, chatbot], queue=False
#     ).then(
#         chat_fn, [chatbot, enable_think, temperature, top_p, top_k, repetition_penalty, max_new_tokens], chatbot
#     )
#     clear.click(lambda: None, None, chatbot, queue=False)
#
# demo.queue().launch()
import gc
from pathlib import Path
from llama_cpp import Llama
import gradio as gr
from pypdf import PdfReader
import pandas as pd
from docx import Document
MAX_TOKENS = 10_000
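# Pull the Q4_K_M GGUF build of SmolLM3-3B from the Hugging Face Hub via
# llama-cpp-python; n_ctx caps the context window at MAX_TOKENS tokens.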
llm = Llama.from_pretrained(
    repo_id="unsloth/SmolLM3-3B-GGUF",
    filename="SmolLM3-3B-Q4_K_M.gguf",
    n_ctx=MAX_TOKENS,
)
gc.collect()
# ---------- helpers ----------
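# read_file: best-effort text extraction for uploaded attachments. PDFs go through
# pypdf, spreadsheets through pandas, .docx through python-docx; anything else is
# read as plain text, and unreadable files fall back to a placeholder string.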
def read_file(p: Path) -> str:
    try:
        suffix = p.suffix.lower()
        if suffix == ".pdf":
            with p.open("rb") as f:
                reader = PdfReader(f)
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        elif suffix in (".xlsx", ".xls"):
            sheets = pd.read_excel(p, sheet_name=None)
            text = ""
            for sheet_name, df in sheets.items():
                text += df.to_string()
            return text
        elif suffix == ".docx":
            with p.open("rb") as f:
                doc = Document(f)
                return "\n".join(para.text for para in doc.paragraphs)
        else:
            return p.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return "[could not read file]"
def build_messages(history, enable_thinking: bool):
    messages = []
    for h in history:
        messages.append({"role": h["role"], "content": h["content"]})
    system_flag = "/think" if enable_thinking else "/no_think"
    messages.insert(0, {"role": "system", "content": system_flag})
    return messages
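# chat_fn: run a single (non-streamed) chat completion, escape <think> tags so the
# reasoning trace renders literally in the Chatbot, then yield the reply character
# by character to simulate streaming.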
def chat_fn(history, enable_thinking, temperature, top_p, top_k,
            repetition_penalty, max_new_tokens):
    messages = build_messages(history, enable_thinking)
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=int(max_new_tokens),      # sliders yield floats; llama.cpp expects ints
        temperature=temperature,
        top_p=top_p,
        top_k=int(top_k),
        repeat_penalty=repetition_penalty,
    )
    response_text = response['choices'][0]['message']['content']
    if isinstance(response_text, str):
        response_text = response_text.replace('<think>', "# &lt;think&gt;").replace('</think>', "&lt;/think&gt;")
    elif isinstance(response_text, list):
        response_text = [t.replace('<think>', "# &lt;think&gt;").replace('</think>', "&lt;/think&gt;") for t in response_text]
    else:
        raise ValueError("Malformed response from model")
    history.append({"role": "assistant", "content": ""})
    for ch in response_text:
        history[-1]["content"] += ch
        yield history
# ---------- UI ----------
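# Blocks layout: sampling controls in a top row, then the chat history, a message
# box with a Send button, a multi-file uploader, and a Clear button.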
with gr.Blocks(title="SmolLM3-3B Chat") as demo:
    gr.Markdown("## 🤖 SmolLM3-3B Chatbot (Streaming)")
    with gr.Row():
        enable_think = gr.Checkbox(label="Enable Extended Thinking (/think)", value=True)
        temperature = gr.Slider(0.0, 1.0, value=0.6, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
        top_k = gr.Slider(1, 40, value=20, label="Top-k")
        repetition_penalty = gr.Slider(1.0, 1.4, value=1.1, label="Repetition Penalty")
        max_new_tokens = gr.Slider(1000, MAX_TOKENS, value=MAX_TOKENS, label="Max New Tokens")
    chatbot = gr.Chatbot(type="messages")
    with gr.Row():
        msg = gr.Textbox(placeholder="Type your message here…", lines=1, scale=8)
        send_btn = gr.Button("Send", scale=1)
    file_uploader = gr.File(label="Attach file(s)", file_count="multiple", file_types=None)
    clear = gr.Button("Clear")

    def user_fn(user_msg, history, files):
        if files:
            file_contents = "\n\n".join(read_file(Path(fp)) for fp in files)
            user_msg += f"\n\n# FILE CONTENT:\n\n{file_contents}"
        return "", history + [{"role": "user", "content": user_msg}], None  # clear file_uploader

    # Submit on button click or Enter key
    for trigger in (msg.submit, send_btn.click):
        trigger(
            user_fn, [msg, chatbot, file_uploader], [msg, chatbot, file_uploader], queue=False
        ).then(
            chat_fn,
            [chatbot, enable_think, temperature, top_p, top_k, repetition_penalty, max_new_tokens],
            chatbot
        )

    clear.click(lambda: None, None, chatbot, queue=False)
demo.queue().launch()