BasilTh committed · Commit ae152d5 · Parent(s): 8eb6be4

Deploy updated SLM customer-support chatbot

Files changed:
- SLM_CService.py  +14 -15
- app.py           +2 -3
SLM_CService.py
CHANGED

@@ -10,7 +10,7 @@ os.environ["OMP_NUM_THREADS"] = "1"
 os.environ.pop("HF_HUB_OFFLINE", None)
 
 # 1) Import Unsloth BEFORE transformers/peft so its patches apply
-import unsloth  # must come first
+import unsloth  # must come first
 import torch
 from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
 from peft import PeftModel
@@ -21,8 +21,16 @@ from langchain.memory import ConversationBufferMemory
 REPO = "ThomasBasil/bitext-qlora-tinyllama"   # adapter + tokenizer at ROOT
 BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"   # base model
 
+# Generation params (PASS THESE AT CALL TIME – not as generate_kwargs)
+GEN_KW = {
+    "max_new_tokens": 128,
+    "do_sample": True,
+    "top_p": 0.9,
+    "temperature": 0.7,
+}
+
 # 4-bit NF4 for QLoRA-style loading (needs GPU)
-#
+# float16 = broadest compatibility on T4/A10G GPUs in Spaces
 bnb_cfg = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
@@ -38,17 +46,14 @@ tokenizer.padding_side = "left"
 tokenizer.truncation_side = "right"
 
 # ---- Base model (4-bit) via Unsloth -----------------------------------------
-# IMPORTANT: Unsloth returns (model, tokenizer). UNPACK IT.
-# Ref: Unsloth wiki/examples show `model, tokenizer = FastLanguageModel.from_pretrained(...)`
-# We'll ignore Unsloth's tokenizer since we already loaded the one from your repo.
+# IMPORTANT: Unsloth returns (model, tokenizer). UNPACK IT.
 model, _ = unsloth.FastLanguageModel.from_pretrained(
     model_name=BASE,
     load_in_4bit=True,
-    quantization_config=bnb_cfg,  #
+    quantization_config=bnb_cfg,  # modern arg
     device_map="auto",
     trust_remote_code=True,
 )
-# Optional speed patch:
 unsloth.FastLanguageModel.for_inference(model)
 
 # ---- Attach your PEFT adapter (from repo ROOT) -------------------------------
@@ -56,19 +61,13 @@ unsloth.FastLanguageModel.for_inference(model)
 model = PeftModel.from_pretrained(model, REPO)
 model.eval()
 
-# ---- Text-generation pipeline (
+# ---- Text-generation pipeline (NO generate_kwargs here) ----------------------
 chat_pipe = pipeline(
     "text-generation",
     model=model,
     tokenizer=tokenizer,
     trust_remote_code=True,
     return_full_text=False,
-    generate_kwargs={
-        "max_new_tokens": 128,
-        "do_sample": True,
-        "top_p": 0.9,
-        "temperature": 0.7,
-    },
 )
 
 # ──────────────────────────────────────────────────────────────────────────────
@@ -163,7 +162,7 @@ def chat_with_memory(user_input: str) -> str:
 
     # E) fallback → generate with chat history context
     prompt = _history_to_prompt(ui)
-    out = chat_pipe(prompt)[0]["generated_text"]
+    out = chat_pipe(prompt, **GEN_KW)[0]["generated_text"]  # ← pass gen args here
     reply = out.split("Assistant:")[-1].strip()
     memory.save_context({"input": ui}, {"output": reply})
     return reply
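Note on the main change above: the sampling settings now live in a module-level GEN_KW dict and are unpacked on each chat_pipe(...) call instead of being passed to pipeline(...) as generate_kwargs. A minimal sketch of that calling pattern with plain transformers (the Unsloth, 4-bit and PEFT-adapter steps from the real file are deliberately omitted, so this is an illustration of the kwargs flow, not the deployed setup):

# Sketch only: plain transformers pipeline, no Unsloth, no 4-bit, no adapter.
from transformers import pipeline

GEN_KW = {
    "max_new_tokens": 128,
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.7,
}

chat_pipe = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # BASE in SLM_CService.py
    return_full_text=False,
)

prompt = "User: Where is my order?\nAssistant:"
# Generation arguments are supplied per call, not at pipeline construction.
out = chat_pipe(prompt, **GEN_KW)[0]["generated_text"]
print(out.strip())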
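The fallback path also relies on _history_to_prompt(ui) and recovers the reply by splitting on "Assistant:". That helper is untouched by this commit and its body is not shown; the sketch below is only a guessed shape consistent with those two facts and with the ConversationBufferMemory import, rendering the buffered turns as a User:/Assistant: transcript that ends with a dangling "Assistant:" for the model to complete.

# Hypothetical sketch of _history_to_prompt(); the real implementation is not in this diff.
from langchain.memory import ConversationBufferMemory

# Module-level memory as in SLM_CService.py (constructor args assumed).
memory = ConversationBufferMemory(return_messages=True)

def _history_to_prompt(user_input: str) -> str:
    lines = []
    for msg in memory.chat_memory.messages:           # buffered turns from prior calls
        role = "User" if msg.type == "human" else "Assistant"
        lines.append(f"{role}: {msg.content}")
    lines.append(f"User: {user_input}")
    lines.append("Assistant:")                        # model completes from here
    return "\n".join(lines)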
app.py
CHANGED

@@ -1,5 +1,5 @@
 import os
-os.environ["OMP_NUM_THREADS"] = "1"
+os.environ["OMP_NUM_THREADS"] = "1"
 
 import gradio as gr
 from SLM_CService import chat_with_memory
@@ -13,7 +13,7 @@ def respond(user_message, history):
 
 with gr.Blocks() as demo:
     gr.Markdown("# π Customer Support Chatbot")
-    chatbot = gr.Chatbot()
+    chatbot = gr.Chatbot(type="tuples")  # explicitly pick format; see Gradio docs.
     with gr.Row():
         user_in = gr.Textbox(placeholder="Type your message here...", scale=5)
         send = gr.Button("Send", variant="primary")
@@ -21,7 +21,6 @@ with gr.Blocks() as demo:
 
     send.click(respond, [user_in, chatbot], [chatbot, chatbot])
     reset.click(lambda: ([], []), None, [chatbot, chatbot])
-    # Optional: submit on enter
     user_in.submit(respond, [user_in, chatbot], [chatbot, chatbot])
 
 if __name__ == "__main__":
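The app.py change pins the Chatbot payload format. With type="tuples" the component's value is a list of (user_message, bot_reply) pairs, so the respond() callback named in the hunk header, whose body is not part of this diff, presumably appends one pair per turn and returns the updated history twice to match outputs=[chatbot, chatbot]. A hypothetical version showing that shape:

# Hypothetical respond(); the actual implementation is not shown in this commit.
from SLM_CService import chat_with_memory

def respond(user_message, history):
    history = history or []                   # Chatbot value may start empty
    reply = chat_with_memory(user_message)    # model call from SLM_CService.py
    history.append((user_message, reply))     # one (user, assistant) tuple per turn
    return history, history                   # matches outputs=[chatbot, chatbot]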