BasilTh committed on
Commit a58eed0 · 1 Parent(s): ba3a0d2

Deploy updated SLM customer-support chatbot

Files changed (2)
  1. SLM_CService.py +25 -73
  2. app.py +0 -1
SLM_CService.py CHANGED
@@ -4,15 +4,13 @@
 import os
 import re

-# Keep OpenMP quiet in Spaces logs
+# Calm OpenMP noise in Spaces logs
 os.environ["OMP_NUM_THREADS"] = "1"
-# Ensure we don't accidentally run offline
+# Don’t accidentally run offline
 os.environ.pop("HF_HUB_OFFLINE", None)

-# 1) Unsloth must be imported BEFORE transformers/peft to apply optimizations.
-# (Otherwise you may see perf/memory warnings.)
-# Ref: Unsloth team warning in issues.
-import unsloth  # noqa: E402  # must be before transformers/peft :contentReference[oaicite:2]{index=2}
+# 1) Import Unsloth BEFORE transformers/peft so its patches apply
+import unsloth  # noqa: E402

 import torch
 from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
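Note on the import-order comment in this hunk: Unsloth patches transformers at import time, so the ordering only helps if nothing has imported transformers first. A minimal, hypothetical guard (not part of this commit) would make that ordering explicit:

import sys

# Hypothetical guard, not in the commit: unsloth patches transformers at import
# time, so refuse to continue if transformers was already imported.
if "transformers" in sys.modules:
    raise RuntimeError("Import unsloth before transformers so its patches apply")

import unsloth  # noqa: E402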
@@ -20,86 +18,42 @@ from peft import PeftModel
 from langchain.memory import ConversationBufferMemory

 # ──────────────────────────────────────────────────────────────────────────────
-# Your Hub repo that contains the tokenizer + PEFT adapter files
-REPO = "ThomasBasil/bitext-qlora-tinyllama"
-BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-
-# If your files are nested, set this to the exact subfolder path (or use
-# the HF_SUBFOLDER env var from Space → Settings → Variables).
-# Example from your screenshot:
-DEFAULT_SUBFOLDER = "bitext-qlora-tinyllama"
-SUBFOLDER = os.environ.get("HF_SUBFOLDER", DEFAULT_SUBFOLDER)
-
-# 4-bit NF4 quantization config (QLoRA-style)
-# Ref: Transformers bitsandbytes quantization docs. :contentReference[oaicite:3]{index=3}
+# Your Hub repos
+REPO = "ThomasBasil/bitext-qlora-tinyllama"   # <-- your adapter + tokenizer (AT ROOT)
+BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"   # base model
+
+# 4-bit NF4 for QLoRA-style loading
 bnb_cfg = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
     bnb_4bit_use_double_quant=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
+    # float16 is broadly compatible with T4/A10G Spaces
+    bnb_4bit_compute_dtype=torch.float16,
 )

-# ---- Robust helpers to load from root or subfolder ---------------------------
-
-def _load_tokenizer(repo_id: str):
-    """
-    Try to load tokenizer from repo root; if missing, try configured subfolder.
-    Transformers supports `subfolder` in from_pretrained for tokenizers. :contentReference[oaicite:4]{index=4}
-    """
-    # Try at repo root first
-    try:
-        tok = AutoTokenizer.from_pretrained(repo_id, use_fast=False)
-    except Exception:
-        # Try "tokenizer" subdir at root
-        try:
-            tok = AutoTokenizer.from_pretrained(repo_id, subfolder="tokenizer", use_fast=False)
-        except Exception:
-            # Try the provided nested path
-            tok = AutoTokenizer.from_pretrained(repo_id, subfolder=SUBFOLDER, use_fast=False)
-
-    # sensible defaults for causal LM
-    if tok.pad_token_id is None and tok.eos_token_id is not None:
-        tok.pad_token_id = tok.eos_token_id
-    tok.padding_side = "left"
-    tok.truncation_side = "right"
-    return tok
-
-
-def _attach_adapter(base_model, repo_id: str):
-    """
-    Attach PEFT adapter from root; if not found, try subfolder variants.
-    (PEFT supports kwargs like `subfolder`, though older versions had quirks;
-    if you ever hit issues, place adapter files at repo root.) :contentReference[oaicite:5]{index=5}
-    """
-    # Try repo root
-    try:
-        return PeftModel.from_pretrained(base_model, repo_id)
-    except Exception:
-        # Try 'adapter' subdir at root
-        try:
-            return PeftModel.from_pretrained(base_model, repo_id, subfolder="adapter")
-        except Exception:
-            # Try the provided nested path
-            return PeftModel.from_pretrained(base_model, repo_id, subfolder=SUBFOLDER)
-
-
-# ---- Load tokenizer, base model (4-bit), and attach adapter ------------------
-
-tokenizer = _load_tokenizer(REPO)
+# ---- Tokenizer (from repo ROOT) ---------------------------------------------
+# Your repo root contains: tokenizer.model / tokenizer_config.json / special_tokens_map.json
+tokenizer = AutoTokenizer.from_pretrained(REPO, use_fast=False)
+if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
+    tokenizer.pad_token_id = tokenizer.eos_token_id
+tokenizer.padding_side = "left"
+tokenizer.truncation_side = "right"

+# ---- Base model (4-bit) via Unsloth -----------------------------------------
 model = unsloth.FastLanguageModel.from_pretrained(
     BASE,
     load_in_4bit=True,
-    quantization_config=bnb_cfg,  # prefer quantization_config over legacy args
+    quantization_config=bnb_cfg,
     device_map="auto",
     trust_remote_code=True,
 )

-model = _attach_adapter(model, REPO)
+# ---- Attach your PEFT adapter (from repo ROOT) -------------------------------
+# Your repo root contains: adapter_config.json + adapter_model.safetensors
+model = PeftModel.from_pretrained(model, REPO)
 model.eval()

-# Transformers pipeline accepts `generate_kwargs` to pass through to .generate().
-# Ref: Pipelines docs mention `generate_kwargs`. :contentReference[oaicite:6]{index=6}
+# ---- Text-generation pipeline (use generate_kwargs) --------------------------
 chat_pipe = pipeline(
     "text-generation",
     model=model,
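The dtype switch in this hunk (from torch.bfloat16 to torch.float16) follows from the T4/A10G comment: T4-class GPUs lack native bfloat16. A hedged alternative, not part of this commit, is to pick the compute dtype at runtime with torch.cuda.is_bf16_supported():

import torch
from transformers import BitsAndBytesConfig

# Sketch only: use bfloat16 where the GPU supports it, else fall back to float16
# (e.g. on a T4 Space).
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)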
@@ -141,13 +95,11 @@ def handle_escalation(_=None):
 stored_order = None
 pending_intent = None

-
 def _history_to_prompt(user_input: str) -> str:
     """Build a plain-text prompt that includes chat history for fallback generation."""
     hist = memory.load_memory_variables({}).get("chat_history", [])
     prompt = "You are a helpful support assistant.\n"
     for msg in hist:
-        # LangChain messages often have .type ('human'/'ai') and .content
         mtype = getattr(msg, "type", "")
         role = "User" if mtype == "human" else "Assistant"
         content = getattr(msg, "content", "")
@@ -155,7 +107,6 @@ def _history_to_prompt(user_input: str) -> str:
     prompt += f"User: {user_input}\nAssistant: "
     return prompt

-
 def chat_with_memory(user_input: str) -> str:
     """Main entrypoint called by app.py."""
     global stored_order, pending_intent
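The comment removed above noted that LangChain messages expose .type ("human"/"ai") and .content, which is exactly what _history_to_prompt reads. The memory object itself is defined outside this diff, so its kwargs below are assumptions; this sketch just shows how such a buffer would be filled and consumed:

from langchain.memory import ConversationBufferMemory

# Assumed configuration: message objects (not a flat string) under "chat_history".
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
memory.save_context({"input": "Where is my order?"},
                    {"output": "Could you share your order number?"})

for msg in memory.load_memory_variables({})["chat_history"]:
    role = "User" if msg.type == "human" else "Assistant"
    print(f"{role}: {msg.content}")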
@@ -213,3 +164,4 @@ def chat_with_memory(user_input: str) -> str:
     reply = out.split("Assistant:")[-1].strip()
     memory.save_context({"input": ui}, {"output": reply})
     return reply
+
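The chat_pipe construction is truncated in this view; the new section comment only says to route generation options through the pipeline. As a hedged usage sketch (the parameter values are illustrative, not the Space's real settings), the text-generation pipeline forwards generation kwargs passed at call time to model.generate(), and the reply post-processing mirrors the split used in chat_with_memory:

# Illustrative call; max_new_tokens/temperature are placeholders.
prompt = _history_to_prompt("My order 123 hasn't arrived yet.")
out = chat_pipe(prompt, max_new_tokens=128, do_sample=True, temperature=0.7)[0]["generated_text"]
reply = out.split("Assistant:")[-1].strip()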
 
 
app.py CHANGED
@@ -26,4 +26,3 @@ with gr.Blocks() as demo:

 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)
-
 
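Only the tail of app.py appears in this diff; the hunk header shows it builds a gr.Blocks UI, and SLM_CService.py documents chat_with_memory as the entrypoint it calls. A hypothetical minimal wiring, with the component names and respond helper invented for illustration:

import gradio as gr
from SLM_CService import chat_with_memory

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()                               # invented component names
    box = gr.Textbox(placeholder="Ask a support question")

    def respond(user_msg, history):
        # Append the (user, bot) turn and clear the textbox.
        history = history + [(user_msg, chat_with_memory(user_msg))]
        return "", history

    box.submit(respond, [box, chatbot], [box, chatbot])

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)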