BasilTh committed · Commit ae152d5 · Parent(s): 8eb6be4

Deploy updated SLM customer-support chatbot

Files changed:
- SLM_CService.py  +14 -15
- app.py           +2 -3
SLM_CService.py
CHANGED

@@ -10,7 +10,7 @@ os.environ["OMP_NUM_THREADS"] = "1"
 os.environ.pop("HF_HUB_OFFLINE", None)
 
 # 1) Import Unsloth BEFORE transformers/peft so its patches apply
-import unsloth  # must come first
+import unsloth  # must come first
 import torch
 from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
 from peft import PeftModel
@@ -21,8 +21,16 @@ from langchain.memory import ConversationBufferMemory
 REPO = "ThomasBasil/bitext-qlora-tinyllama"   # adapter + tokenizer at ROOT
 BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"   # base model
 
+# Generation params (PASS THESE AT CALL TIME – not as generate_kwargs)
+GEN_KW = {
+    "max_new_tokens": 128,
+    "do_sample": True,
+    "top_p": 0.9,
+    "temperature": 0.7,
+}
+
 # 4-bit NF4 for QLoRA-style loading (needs GPU)
-#
+# float16 = broadest compatibility on T4/A10G GPUs in Spaces
 bnb_cfg = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
@@ -38,17 +46,14 @@ tokenizer.padding_side = "left"
 tokenizer.truncation_side = "right"
 
 # ---- Base model (4-bit) via Unsloth -----------------------------------------
-# IMPORTANT: Unsloth returns (model, tokenizer). UNPACK IT.
-# Ref: Unsloth wiki/examples show `model, tokenizer = FastLanguageModel.from_pretrained(...)`
-# We'll ignore Unsloth's tokenizer since we already loaded the one from your repo.
+# IMPORTANT: Unsloth returns (model, tokenizer). UNPACK IT.
 model, _ = unsloth.FastLanguageModel.from_pretrained(
     model_name=BASE,
     load_in_4bit=True,
-    quantization_config=bnb_cfg,  #
+    quantization_config=bnb_cfg,  # modern arg
     device_map="auto",
     trust_remote_code=True,
 )
-# Optional speed patch:
 unsloth.FastLanguageModel.for_inference(model)
 
 # ---- Attach your PEFT adapter (from repo ROOT) -------------------------------
@@ -56,19 +61,13 @@ unsloth.FastLanguageModel.for_inference(model)
 model = PeftModel.from_pretrained(model, REPO)
 model.eval()
 
-# ---- Text-generation pipeline (
+# ---- Text-generation pipeline (NO generate_kwargs here) ----------------------
 chat_pipe = pipeline(
     "text-generation",
     model=model,
     tokenizer=tokenizer,
     trust_remote_code=True,
     return_full_text=False,
-    generate_kwargs={
-        "max_new_tokens": 128,
-        "do_sample": True,
-        "top_p": 0.9,
-        "temperature": 0.7,
-    },
 )
 
 # ──────────────────────────────────────────────────────────────────────────────
@@ -163,7 +162,7 @@ def chat_with_memory(user_input: str) -> str:
 
     # E) fallback → generate with chat history context
     prompt = _history_to_prompt(ui)
-    out = chat_pipe(prompt)[0]["generated_text"]
+    out = chat_pipe(prompt, **GEN_KW)[0]["generated_text"]  # ← pass gen args here
     reply = out.split("Assistant:")[-1].strip()
     memory.save_context({"input": ui}, {"output": reply})
     return reply
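Note on the main change above: the sampling settings now live in a module-level GEN_KW dict and are unpacked on each chat_pipe(...) call instead of being passed to pipeline(...) as generate_kwargs. A minimal sketch of that calling pattern with plain transformers (the Unsloth, 4-bit and PEFT-adapter steps from the real file are deliberately omitted, so this is an illustration of the kwargs flow, not the deployed setup):

# Sketch only: plain transformers pipeline, no Unsloth, no 4-bit, no adapter.
from transformers import pipeline

GEN_KW = {
    "max_new_tokens": 128,
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.7,
}

chat_pipe = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # BASE in SLM_CService.py
    return_full_text=False,
)

prompt = "User: Where is my order?\nAssistant:"
# Generation arguments are supplied per call, not at pipeline construction.
out = chat_pipe(prompt, **GEN_KW)[0]["generated_text"]
print(out.strip())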
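The fallback path also relies on _history_to_prompt(ui) and recovers the reply by splitting on "Assistant:". That helper is untouched by this commit and its body is not shown; the sketch below is only a guessed shape consistent with those two facts and with the ConversationBufferMemory import, rendering the buffered turns as a User:/Assistant: transcript that ends with a dangling "Assistant:" for the model to complete.

# Hypothetical sketch of _history_to_prompt(); the real implementation is not in this diff.
from langchain.memory import ConversationBufferMemory

# Module-level memory as in SLM_CService.py (constructor args assumed).
memory = ConversationBufferMemory(return_messages=True)

def _history_to_prompt(user_input: str) -> str:
    lines = []
    for msg in memory.chat_memory.messages:           # buffered turns from prior calls
        role = "User" if msg.type == "human" else "Assistant"
        lines.append(f"{role}: {msg.content}")
    lines.append(f"User: {user_input}")
    lines.append("Assistant:")                        # model completes from here
    return "\n".join(lines)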
app.py
CHANGED

@@ -1,5 +1,5 @@
 import os
-os.environ["OMP_NUM_THREADS"] = "1"
+os.environ["OMP_NUM_THREADS"] = "1"
 
 import gradio as gr
 from SLM_CService import chat_with_memory
@@ -13,7 +13,7 @@ def respond(user_message, history):
 
 with gr.Blocks() as demo:
     gr.Markdown("# π Customer Support Chatbot")
-    chatbot = gr.Chatbot()
+    chatbot = gr.Chatbot(type="tuples")  # explicitly pick format; see Gradio docs.
     with gr.Row():
         user_in = gr.Textbox(placeholder="Type your message here...", scale=5)
         send = gr.Button("Send", variant="primary")
@@ -21,7 +21,6 @@ with gr.Blocks() as demo:
 
     send.click(respond, [user_in, chatbot], [chatbot, chatbot])
     reset.click(lambda: ([], []), None, [chatbot, chatbot])
-    # Optional: submit on enter
     user_in.submit(respond, [user_in, chatbot], [chatbot, chatbot])
 
 if __name__ == "__main__":
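The app.py change pins the Chatbot payload format. With type="tuples" the component's value is a list of (user_message, bot_reply) pairs, so the respond() callback named in the hunk header, whose body is not part of this diff, presumably appends one pair per turn and returns the updated history twice to match outputs=[chatbot, chatbot]. A hypothetical version showing that shape:

# Hypothetical respond(); the actual implementation is not shown in this commit.
from SLM_CService import chat_with_memory

def respond(user_message, history):
    history = history or []                   # Chatbot value may start empty
    reply = chat_with_memory(user_message)    # model call from SLM_CService.py
    history.append((user_message, reply))     # one (user, assistant) tuple per turn
    return history, history                   # matches outputs=[chatbot, chatbot]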