Spaces:

david-thrower
/

3B-Param-Basic-Chatbot

Build error

File size: 11,470 Bytes


# import gc

# import gradio as gr
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM #, HqqConfig

# # # quant_config = HqqConfig(nbits=8, group_size=64)

# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# print("Loading tokenizer & model…")
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# # # model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to(DEVICE)

# model =\
#         AutoModelForCausalLM\
#                 .from_pretrained(
#                         MODEL_ID, 
#                         torch_dtype=torch.float16, 
#                         # device_map="cuda", 
#                         # quantization_config=quant_config
#         ).to(DEVICE)

# gc.collect()

#########

# import torch
# from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
# from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig

# # quant_config = Float8WeightOnlyConfig()
# quant_config = Float8DynamicActivationFloat8WeightConfig()
# quantization_config = TorchAoConfig(quant_type=quant_config)

# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"

# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_ID,
#     torch_dtype="auto",
#     device_map="auto",
#     quantization_config=quantization_config)

# gc.collect()


#########

# from unsloth import FastLanguageModel

# model, tokenizer = FastLanguageModel.from_pretrained(
#     "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
#     max_seq_length=128_000,
#     load_in_4bit=True
# )

#########

# import gc

# import gradio as gr
# from transformers import AutoTokenizer
# from optimum.onnxruntime import ORTModelForCausalLM, ORTQuantizer
# from optimum.onnxruntime.configuration import AutoQuantizationConfig

# MODEL_NAME = "HuggingFaceTB/SmolLM3-3B"



# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = ORTModelForCausalLM.from_pretrained(MODEL_NAME, export=True)

# print("Creating quant config")
# qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
# print("Creating quant config successful")

# print("Creating quantizer")
# quantizer = ORTQuantizer.from_pretrained(model)
# print("Creating quantizer successful")
# # Step 4: Perform quantization saving output in a new directory
# quantized_model_dir = "./quantized_model"
# print("Starting quantization...")
# quantizer.quantize(save_dir=quantized_model_dir, quantization_config=qconfig)
# print("Quantization was successful. Garbage collecting...")

# del(quantizer)
# del(qconfig)
# del(model)

# Run garbage collection again to release memory from quantizer objects
# gc.collect()

# # Step 5: Load the quantized ONNX model for inference
# print("Loading quantized ONNX model for inference...")
# model = ORTModelForCausalLM.from_pretrained(quantized_model_dir)
# print("Loading model was successful. Garbage collecting.")

# Garbage collection again after final loading
# gc.collect()

#########

# print("Loading tokenizer & model…")
# import gradio as gr
# from transformers import AutoTokenizer
# from optimum.onnxruntime import ORTModelForCausalLM

# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# model = ORTModelForCausalLM.from_pretrained(MODEL_ID, export=True, quantize=True)

#########


# -------------------------------------------------
# Optional tool(s)
# -------------------------------------------------
# TOOLS = [{
#     "name": "get_weather",
#     "description": "Get the current weather in a given city",
#     "parameters": {
#         "type": "object",
#         "properties": {
#             "city": {"type": "string", "description": "City name"}
#         },
#         "required": ["city"]
#     }
# }]

# -------------------------------------------------
# Helpers
# -------------------------------------------------

# def build_messages(history, enable_thinking: bool):
#     """Convert Gradio history to the chat template."""
#     messages = []
#     for h in history:
#         messages.append({"role": h["role"], "content": h["content"]})
#     # Add system instruction for mode
#     system_flag = "/think" if enable_thinking else "/no_think"
#     messages.insert(0, {"role": "system", "content": system_flag})
#     return messages

# def chat_fn(history, enable_thinking, temperature, top_p, top_k, repetition_penalty, max_new_tokens):
#     """Generate a streaming response."""
#     messages = build_messages(history, enable_thinking)
#     text = tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True,
#         # xml_tools=TOOLS
#     )
#     inputs = tokenizer(text, return_tensors="pt")
#     gc.collect()
#     with torch.inference_mode():
#         streamer = model.generate(
#             **inputs,
#             max_new_tokens=max_new_tokens,
#             do_sample=True,
#             temperature=temperature,
#             top_p=top_p,
#             top_k=top_k,
#             repetition_penalty=repetition_penalty,
#             pad_token_id=tokenizer.eos_token_id,
#             streamer=None          # we'll yield manually
#         )
#     gc.collect()
#     output_ids = streamer[0][len(inputs.input_ids[0]):]
#     response = tokenizer.decode(output_ids, skip_special_tokens=True)
#     if isinstance(response, str): 
#         response = response.replace('<think>',"# &lt;think&gt;").replace('</think>',"&lt;/think&gt;")
#     elif isinstance(response,list):
#         response = [paper.replace('<think>',"# &lt;think&gt;").replace('</think>',"&lt;/think&gt;") for paper in response]
#     else:
#         raise ValueError("Tokenizer response seems malformed; Not a string, nor a list?!?!")

#     # streaming char-by-char
#     history.append({"role": "assistant", "content": ""})
#     for ch in response:
#         history[-1]["content"] += ch
#         yield history

# # -------------------------------------------------
# # Blocks UI
# # -------------------------------------------------
# with gr.Blocks(title="SmolLM3-3B Chat") as demo:
#     gr.Markdown("## 🤖 SmolLM3-3B Chatbot (Streaming)")
#     with gr.Row():
#         enable_think = gr.Checkbox(label="Enable Extended Thinking (/think)", value=True)
#         temperature = gr.Slider(0.0, 1.0, value=0.6, label="Temperature")
#         top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
#         top_k = gr.Slider(1,40,value=20,label="Top_k")
#         repetition_penalty = gr.Slider(1.0,1.4,value=1.1,label="Repetition_Penalty")
#         max_new_tokens = gr.Slider(1000,32768,value=32768,label="Max_New_Tokens")
#     chatbot = gr.Chatbot(type="messages")
#     msg = gr.Textbox(placeholder="Type your message here…", lines=1)
#     clear = gr.Button("Clear")

#     def user_fn(user_msg, history):
#         return "", history + [{"role": "user", "content": user_msg}]

#     msg.submit(
#         user_fn, [msg, chatbot], [msg, chatbot], queue=False
#     ).then(
#         chat_fn, [chatbot, enable_think, temperature, top_p, top_k, repetition_penalty, max_new_tokens], chatbot
#     )
#     clear.click(lambda: None, None, chatbot, queue=False)

# demo.queue().launch()

import gc
from pathlib import Path
from llama_cpp import Llama
import gradio as gr

from pypdf import PdfReader
import pandas as pd
from docx import Document

# Context-window size handed to llama.cpp via n_ctx; the same constant is
# reused further down as the upper bound of the "Max New Tokens" slider.
MAX_TOKENS = 10_000

# Load the SmolLM3-3B Q4_K_M GGUF file once, at import time, so every chat
# request shares the same model instance.
# NOTE(review): from_pretrained presumably fetches the file from the Hugging
# Face Hub on first use -- confirm caching behaviour for the deployment.
llm = Llama.from_pretrained(
    repo_id="unsloth/SmolLM3-3B-GGUF",
    filename="SmolLM3-3B-Q4_K_M.gguf",
    n_ctx=MAX_TOKENS,
)
# Run a garbage-collection pass right after the model has been loaded.
gc.collect()

# ---------- helpers ----------


def read_file(p: Path) -> str:
    """Extract the text content of an uploaded file on a best-effort basis.

    The reader is chosen from the lower-cased file suffix:

    * ``.pdf`` -- text of every page, joined by newlines.
    * ``.xlsx`` / ``.xls`` -- every sheet rendered with
      ``DataFrame.to_string``, each preceded by a ``## Sheet: <name>``
      heading and separated by a blank line so sheets do not run together.
    * ``.docx`` -- text of every paragraph, joined by newlines.
    * anything else -- decoded as UTF-8, undecodable bytes dropped.

    Args:
        p: Path of the file to read.

    Returns:
        The extracted text, or the placeholder ``"[could not read file]"``
        when the file cannot be opened or parsed.
    """
    # Imported locally so this function does not rely on a module-level logger.
    import logging

    try:
        suffix = p.suffix.lower()
        if suffix == ".pdf":
            with p.open("rb") as f:
                reader = PdfReader(f)
                # `or ""` guards against extract_text() returning a falsy value.
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        if suffix in (".xlsx", ".xls"):
            # sheet_name=None loads every sheet as a {name: DataFrame} mapping.
            sheets = pd.read_excel(p, sheet_name=None)
            return "\n\n".join(
                f"## Sheet: {sheet_name}\n{df.to_string()}"
                for sheet_name, df in sheets.items()
            )
        if suffix == ".docx":
            with p.open("rb") as f:
                doc = Document(f)
                return "\n".join(para.text for para in doc.paragraphs)
        return p.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        # Deliberately broad: the result is pasted into a chat prompt, so a
        # broken attachment must not abort the request. The failure is logged
        # instead of being swallowed silently.
        logging.getLogger(__name__).warning(
            "Could not read file %s", p, exc_info=True
        )
        return "[could not read file]"



def build_messages(history, enable_thinking: bool):
    """Turn the chat history into the message list sent to the model.

    The returned list starts with a system message whose content is the mode
    flag: "/think" when ``enable_thinking`` is truthy, "/no_think" otherwise.
    Each history entry follows, copied down to just its "role" and "content"
    keys; the input history itself is not modified.
    """
    mode_flag = "/no_think" if not enable_thinking else "/think"
    system_message = {"role": "system", "content": mode_flag}
    turns = [
        {"role": turn["role"], "content": turn["content"]}
        for turn in history
    ]
    return [system_message, *turns]

def chat_fn(history, enable_thinking, temperature, top_p, top_k,
            repetition_penalty, max_new_tokens):
    """Generate the assistant reply for ``history`` and stream it to the UI.

    The completion is requested with ``stream=True`` so text is shown as soon
    as the model produces it, instead of waiting for the full generation and
    replaying it one character at a time.

    Args:
        history: Chat history as a list of {"role", "content"} dicts; an
            assistant entry is appended to it and filled in place.
        enable_thinking: Selects the "/think" or "/no_think" system flag.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of candidate tokens kept; coerced to int because a
            slider may deliver a float.
        repetition_penalty: Passed to llama.cpp as ``repeat_penalty``.
        max_new_tokens: Generation limit; coerced to int for the same reason
            as ``top_k``.

    Yields:
        The same ``history`` list after each streamed piece of text has been
        appended to the last (assistant) message.
    """
    messages = build_messages(history, enable_thinking)

    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=int(max_new_tokens),
        temperature=temperature,
        top_p=top_p,
        top_k=int(top_k),
        repeat_penalty=repetition_penalty,
        stream=True,
    )

    # `messages` was built above, so this placeholder is not part of the prompt.
    history.append({"role": "assistant", "content": ""})
    raw_text = ""
    for chunk in stream:
        # Role-only and final chunks carry no "content"; skip them.
        delta = chunk["choices"][0].get("delta") or {}
        piece = delta.get("content")
        if not piece:
            continue
        raw_text += piece
        # Escape on the accumulated text so a tag split across two chunks is
        # still rewritten once it is complete.
        history[-1]["content"] = (
            raw_text
            .replace('<think>', "# &lt;think&gt;")
            .replace('</think>', "&lt;/think&gt;")
        )
        yield history

# ---------- UI ----------
# Build the Gradio interface: sampling controls, chat window, message box,
# file attachment and a clear button.
with gr.Blocks(title="SmolLM3-3B Chat") as demo:
    gr.Markdown("## 🤖 SmolLM3-3B Chatbot (Streaming)")
    with gr.Row():
        enable_think = gr.Checkbox(label="Enable Extended Thinking (/think)", value=True)
        temperature = gr.Slider(0.0, 1.0, value=0.6, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
        # step=1 keeps top-k integral: without an explicit step Gradio derives
        # one from the range (fractional for 1-40), and llama.cpp expects an
        # integer top_k.
        top_k = gr.Slider(1, 40, value=20, step=1, label="Top-k")
        repetition_penalty = gr.Slider(1.0, 1.4, value=1.1, label="Repetition Penalty")
        max_new_tokens = gr.Slider(1000, MAX_TOKENS, value=MAX_TOKENS, label="Max New Tokens")

    chatbot = gr.Chatbot(type="messages")
    with gr.Row():
        msg = gr.Textbox(placeholder="Type your message here…", lines=1, scale=8)
        send_btn = gr.Button("Send", scale=1)
    file_uploader = gr.File(label="Attach file(s)", file_count="multiple", file_types=None)

    clear = gr.Button("Clear")

    def user_fn(user_msg, history, files):
        """Append the user's message (plus any attached file text) to history.

        Returns a 3-tuple matching the outputs [msg, chatbot, file_uploader]:
        an empty string to clear the textbox, a new history list ending with
        the user message, and None to clear the file uploader.
        """
        if files:
            # Each attachment is converted to text and appended to the message.
            file_contents = "\n\n".join(read_file(Path(fp)) for fp in files)
            user_msg += f"\n\n# FILE CONTENT:\n\n{file_contents}"
        return "", history + [{"role": "user", "content": user_msg}], None  # clear file_uploader

    # Submit on button click or Enter key: first record the user turn, then
    # run the model on the updated history.
    for trigger in (msg.submit, send_btn.click):
        trigger(
            user_fn, [msg, chatbot, file_uploader], [msg, chatbot, file_uploader], queue=False
        ).then(
            chat_fn,
            [chatbot, enable_think, temperature, top_p, top_k, repetition_penalty, max_new_tokens],
            chatbot
        )

    # Returning None to the chatbot output resets the conversation.
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue().launch()