BasilTh committed on
Commit ae152d5 · 1 Parent(s): 8eb6be4

Deploy updated SLM customer-support chatbot

Files changed (2)
  1. SLM_CService.py +14 -15
  2. app.py +2 -3
SLM_CService.py CHANGED
@@ -10,7 +10,7 @@ os.environ["OMP_NUM_THREADS"] = "1"
 os.environ.pop("HF_HUB_OFFLINE", None)
 
 # 1) Import Unsloth BEFORE transformers/peft so its patches apply
-import unsloth  # must come first (Unsloth docs recommend this)
+import unsloth  # must come first
 import torch
 from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
 from peft import PeftModel
@@ -21,8 +21,16 @@ from langchain.memory import ConversationBufferMemory
 REPO = "ThomasBasil/bitext-qlora-tinyllama"   # adapter + tokenizer at ROOT
 BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"   # base model
 
+# Generation params (PASS THESE AT CALL TIME, not as generate_kwargs)
+GEN_KW = {
+    "max_new_tokens": 128,
+    "do_sample": True,
+    "top_p": 0.9,
+    "temperature": 0.7,
+}
+
 # 4-bit NF4 for QLoRA-style loading (needs GPU)
-# Using float16 here for broader GPU compatibility on Spaces (T4/A10G).
+# float16 = broadest compatibility on T4/A10G GPUs in Spaces
 bnb_cfg = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
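
The hunk cuts off before the rest of the bnb_cfg constructor. A full NF4 config of this shape usually also sets a compute dtype and double quantization; the two extra fields below are assumptions (suggested by the float16 comment above), not values taken from the file:

    import torch
    from transformers import BitsAndBytesConfig

    bnb_cfg = BitsAndBytesConfig(
        load_in_4bit=True,                     # store base weights in 4-bit
        bnb_4bit_quant_type="nf4",             # NormalFloat4, the QLoRA quantization type
        bnb_4bit_compute_dtype=torch.float16,  # assumed from the T4/A10G comment
        bnb_4bit_use_double_quant=True,        # assumed; a common QLoRA-style default
    )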
@@ -38,17 +46,14 @@ tokenizer.padding_side = "left"
 tokenizer.truncation_side = "right"
 
 # ---- Base model (4-bit) via Unsloth -----------------------------------------
-# IMPORTANT: Unsloth returns (model, tokenizer). UNPACK IT.
-# Ref: Unsloth wiki/examples show `model, tokenizer = FastLanguageModel.from_pretrained(...)`
-# We'll ignore Unsloth's tokenizer since we already loaded the one from your repo.
+# IMPORTANT: Unsloth returns (model, tokenizer). UNPACK IT.
 model, _ = unsloth.FastLanguageModel.from_pretrained(
     model_name=BASE,
     load_in_4bit=True,
-    quantization_config=bnb_cfg,  # preferred modern arg
+    quantization_config=bnb_cfg,  # modern arg
     device_map="auto",
     trust_remote_code=True,
 )
-# Optional speed patch:
 unsloth.FastLanguageModel.for_inference(model)
 
 # ---- Attach your PEFT adapter (from repo ROOT) -------------------------------
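
The dropped comments refer to a tokenizer loaded earlier from the adapter repo; that load sits above this hunk and is not part of the diff. A plausible shape, assuming a plain AutoTokenizer call against REPO:

    tokenizer = AutoTokenizer.from_pretrained(REPO)  # assumed call; tokenizer shipped with the adapter repo
    tokenizer.padding_side = "left"                  # matches the context lines above
    tokenizer.truncation_side = "right"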
@@ -56,19 +61,13 @@ unsloth.FastLanguageModel.for_inference(model)
 model = PeftModel.from_pretrained(model, REPO)
 model.eval()
 
-# ---- Text-generation pipeline (use generate_kwargs) --------------------------
+# ---- Text-generation pipeline (NO generate_kwargs here) ----------------------
 chat_pipe = pipeline(
     "text-generation",
     model=model,
     tokenizer=tokenizer,
     trust_remote_code=True,
     return_full_text=False,
-    generate_kwargs={
-        "max_new_tokens": 128,
-        "do_sample": True,
-        "top_p": 0.9,
-        "temperature": 0.7,
-    },
 )
 
 # ──────────────────────────────────────────────────────────────────────────────
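
With generate_kwargs removed from the pipeline constructor, each call now supplies its own generation settings. A minimal smoke test of that pattern, reusing the GEN_KW dict added earlier in this diff (the example prompt is made up):

    preview = chat_pipe("User: Where is my order?\nAssistant:", **GEN_KW)
    print(preview[0]["generated_text"])  # sampled completion, at most 128 new tokens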
@@ -163,7 +162,7 @@ def chat_with_memory(user_input: str) -> str:
 
     # E) fallback → generate with chat history context
     prompt = _history_to_prompt(ui)
-    out = chat_pipe(prompt)[0]["generated_text"]
+    out = chat_pipe(prompt, **GEN_KW)[0]["generated_text"]  # ← pass gen args here
     reply = out.split("Assistant:")[-1].strip()
     memory.save_context({"input": ui}, {"output": reply})
     return reply
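
For context, the fallback branch relies on LangChain's ConversationBufferMemory to carry chat history between turns; _history_to_prompt is this repo's own helper and is not shown in the diff. A minimal illustration of the save/load round-trip:

    from langchain.memory import ConversationBufferMemory

    memory = ConversationBufferMemory()
    memory.save_context({"input": "Hi"}, {"output": "Hello! How can I help?"})
    print(memory.load_memory_variables({})["history"])
    # Human: Hi
    # AI: Hello! How can I help?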
 
app.py CHANGED
@@ -1,5 +1,5 @@
 import os
-os.environ["OMP_NUM_THREADS"] = "1"  # silence OpenMP spam
+os.environ["OMP_NUM_THREADS"] = "1"
 
 import gradio as gr
 from SLM_CService import chat_with_memory
@@ -13,7 +13,7 @@ def respond(user_message, history):
 
 with gr.Blocks() as demo:
     gr.Markdown("# 🛎 Customer Support Chatbot")
-    chatbot = gr.Chatbot()
+    chatbot = gr.Chatbot(type="tuples")  # explicitly pick the history format; see Gradio docs
     with gr.Row():
         user_in = gr.Textbox(placeholder="Type your message here...", scale=5)
         send = gr.Button("Send", variant="primary")
@@ -21,7 +21,6 @@ with gr.Blocks() as demo:
 
     send.click(respond, [user_in, chatbot], [chatbot, chatbot])
     reset.click(lambda: ([], []), None, [chatbot, chatbot])
-    # Optional: submit on enter
    user_in.submit(respond, [user_in, chatbot], [chatbot, chatbot])
 
 if __name__ == "__main__":
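
Because chatbot now uses type="tuples", respond (defined above this hunk and not shown in the diff) has to return history as a list of (user, bot) pairs. A sketch consistent with the .click() wiring above, where both outputs target the same Chatbot, could look like:

    def respond(user_message, history):
        history = history or []
        reply = chat_with_memory(user_message)
        history.append((user_message, reply))
        return history, history  # both outputs in the wiring point at the Chatbot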
 