BasilTh committed on
Commit a58eed0 · 1 Parent(s): ba3a0d2

Deploy updated SLM customer-support chatbot

Files changed (2)
  1. SLM_CService.py +25 -73
  2. app.py +0 -1
SLM_CService.py CHANGED
@@ -4,15 +4,13 @@
 import os
 import re

-# Keep OpenMP quiet in Spaces logs
+# Calm OpenMP noise in Spaces logs
 os.environ["OMP_NUM_THREADS"] = "1"
-# Ensure we don't accidentally run offline
+# Don’t accidentally run offline
 os.environ.pop("HF_HUB_OFFLINE", None)

-# 1) Unsloth must be imported BEFORE transformers/peft to apply optimizations.
-# (Otherwise you may see perf/memory warnings.)
-# Ref: Unsloth team warning in issues.
-import unsloth  # noqa: E402  # must be before transformers/peft :contentReference[oaicite:2]{index=2}
+# 1) Import Unsloth BEFORE transformers/peft so its patches apply
+import unsloth  # noqa: E402

 import torch
 from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
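Note on the import-order comment in this hunk: Unsloth patches transformers at import time, so the ordering only helps if nothing has imported transformers first. A minimal, hypothetical guard (not part of this commit) would make that ordering explicit:

import sys

# Hypothetical guard, not in the commit: unsloth patches transformers at import
# time, so refuse to continue if transformers was already imported.
if "transformers" in sys.modules:
    raise RuntimeError("Import unsloth before transformers so its patches apply")

import unsloth  # noqa: E402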
@@ -20,86 +18,42 @@ from peft import PeftModel
 from langchain.memory import ConversationBufferMemory

 # ──────────────────────────────────────────────────────────────────────────────
-# Your Hub repo that contains the tokenizer + PEFT adapter files
-REPO = "ThomasBasil/bitext-qlora-tinyllama"
-BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-
-# If your files are nested, set this to the exact subfolder path (or use
-# the HF_SUBFOLDER env var from Space → Settings → Variables).
-# Example from your screenshot:
-DEFAULT_SUBFOLDER = "bitext-qlora-tinyllama"
-SUBFOLDER = os.environ.get("HF_SUBFOLDER", DEFAULT_SUBFOLDER)
-
-# 4-bit NF4 quantization config (QLoRA-style)
-# Ref: Transformers bitsandbytes quantization docs. :contentReference[oaicite:3]{index=3}
+# Your Hub repos
+REPO = "ThomasBasil/bitext-qlora-tinyllama"   # <-- your adapter + tokenizer (AT ROOT)
+BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"   # base model
+
+# 4-bit NF4 for QLoRA-style loading
 bnb_cfg = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
     bnb_4bit_use_double_quant=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
+    # float16 is broadly compatible with T4/A10G Spaces
+    bnb_4bit_compute_dtype=torch.float16,
 )

-# ---- Robust helpers to load from root or subfolder ---------------------------
-
-def _load_tokenizer(repo_id: str):
-    """
-    Try to load tokenizer from repo root; if missing, try configured subfolder.
-    Transformers supports `subfolder` in from_pretrained for tokenizers. :contentReference[oaicite:4]{index=4}
-    """
-    # Try at repo root first
-    try:
-        tok = AutoTokenizer.from_pretrained(repo_id, use_fast=False)
-    except Exception:
-        # Try "tokenizer" subdir at root
-        try:
-            tok = AutoTokenizer.from_pretrained(repo_id, subfolder="tokenizer", use_fast=False)
-        except Exception:
-            # Try the provided nested path
-            tok = AutoTokenizer.from_pretrained(repo_id, subfolder=SUBFOLDER, use_fast=False)
-
-    # sensible defaults for causal LM
-    if tok.pad_token_id is None and tok.eos_token_id is not None:
-        tok.pad_token_id = tok.eos_token_id
-    tok.padding_side = "left"
-    tok.truncation_side = "right"
-    return tok
-
-
-def _attach_adapter(base_model, repo_id: str):
-    """
-    Attach PEFT adapter from root; if not found, try subfolder variants.
-    (PEFT supports kwargs like `subfolder`, though older versions had quirks;
-    if you ever hit issues, place adapter files at repo root.) :contentReference[oaicite:5]{index=5}
-    """
-    # Try repo root
-    try:
-        return PeftModel.from_pretrained(base_model, repo_id)
-    except Exception:
-        # Try 'adapter' subdir at root
-        try:
-            return PeftModel.from_pretrained(base_model, repo_id, subfolder="adapter")
-        except Exception:
-            # Try the provided nested path
-            return PeftModel.from_pretrained(base_model, repo_id, subfolder=SUBFOLDER)
-
-
-# ---- Load tokenizer, base model (4-bit), and attach adapter ------------------
-
-tokenizer = _load_tokenizer(REPO)
+# ---- Tokenizer (from repo ROOT) ---------------------------------------------
+# Your repo root contains: tokenizer.model / tokenizer_config.json / special_tokens_map.json
+tokenizer = AutoTokenizer.from_pretrained(REPO, use_fast=False)
+if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
+    tokenizer.pad_token_id = tokenizer.eos_token_id
+tokenizer.padding_side = "left"
+tokenizer.truncation_side = "right"

+# ---- Base model (4-bit) via Unsloth -----------------------------------------
 model = unsloth.FastLanguageModel.from_pretrained(
     BASE,
     load_in_4bit=True,
-    quantization_config=bnb_cfg,  # prefer quantization_config over legacy args
+    quantization_config=bnb_cfg,
     device_map="auto",
     trust_remote_code=True,
 )

-model = _attach_adapter(model, REPO)
+# ---- Attach your PEFT adapter (from repo ROOT) -------------------------------
+# Your repo root contains: adapter_config.json + adapter_model.safetensors
+model = PeftModel.from_pretrained(model, REPO)
 model.eval()

-# Transformers pipeline accepts `generate_kwargs` to pass through to .generate().
-# Ref: Pipelines docs mention `generate_kwargs`. :contentReference[oaicite:6]{index=6}
+# ---- Text-generation pipeline (use generate_kwargs) --------------------------
 chat_pipe = pipeline(
     "text-generation",
     model=model,
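The dtype switch in this hunk (from torch.bfloat16 to torch.float16) follows from the T4/A10G comment: T4-class GPUs lack native bfloat16. A hedged alternative, not part of this commit, is to pick the compute dtype at runtime with torch.cuda.is_bf16_supported():

import torch
from transformers import BitsAndBytesConfig

# Sketch only: use bfloat16 where the GPU supports it, else fall back to float16
# (e.g. on a T4 Space).
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)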
@@ -141,13 +95,11 @@ def handle_escalation(_=None):
 stored_order = None
 pending_intent = None

-
 def _history_to_prompt(user_input: str) -> str:
     """Build a plain-text prompt that includes chat history for fallback generation."""
     hist = memory.load_memory_variables({}).get("chat_history", [])
     prompt = "You are a helpful support assistant.\n"
     for msg in hist:
-        # LangChain messages often have .type ('human'/'ai') and .content
         mtype = getattr(msg, "type", "")
         role = "User" if mtype == "human" else "Assistant"
         content = getattr(msg, "content", "")
@@ -155,7 +107,6 @@ def _history_to_prompt(user_input: str) -> str:
     prompt += f"User: {user_input}\nAssistant: "
     return prompt

-
 def chat_with_memory(user_input: str) -> str:
     """Main entrypoint called by app.py."""
     global stored_order, pending_intent
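The comment removed above noted that LangChain messages expose .type ("human"/"ai") and .content, which is exactly what _history_to_prompt reads. The memory object itself is defined outside this diff, so its kwargs below are assumptions; this sketch just shows how such a buffer would be filled and consumed:

from langchain.memory import ConversationBufferMemory

# Assumed configuration: message objects (not a flat string) under "chat_history".
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
memory.save_context({"input": "Where is my order?"},
                    {"output": "Could you share your order number?"})

for msg in memory.load_memory_variables({})["chat_history"]:
    role = "User" if msg.type == "human" else "Assistant"
    print(f"{role}: {msg.content}")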
@@ -213,3 +164,4 @@ def chat_with_memory(user_input: str) -> str:
     reply = out.split("Assistant:")[-1].strip()
     memory.save_context({"input": ui}, {"output": reply})
     return reply
+
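The chat_pipe construction is truncated in this view; the new section comment only says to route generation options through the pipeline. As a hedged usage sketch (the parameter values are illustrative, not the Space's real settings), the text-generation pipeline forwards generation kwargs passed at call time to model.generate(), and the reply post-processing mirrors the split used in chat_with_memory:

# Illustrative call; max_new_tokens/temperature are placeholders.
prompt = _history_to_prompt("My order 123 hasn't arrived yet.")
out = chat_pipe(prompt, max_new_tokens=128, do_sample=True, temperature=0.7)[0]["generated_text"]
reply = out.split("Assistant:")[-1].strip()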
 
 
app.py CHANGED
@@ -26,4 +26,3 @@ with gr.Blocks() as demo:

 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)
-
 
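Only the tail of app.py appears in this diff; the hunk header shows it builds a gr.Blocks UI, and SLM_CService.py documents chat_with_memory as the entrypoint it calls. A hypothetical minimal wiring, with the component names and respond helper invented for illustration:

import gradio as gr
from SLM_CService import chat_with_memory

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()                               # invented component names
    box = gr.Textbox(placeholder="Ask a support question")

    def respond(user_msg, history):
        # Append the (user, bot) turn and clear the textbox.
        history = history + [(user_msg, chat_with_memory(user_msg))]
        return "", history

    box.submit(respond, [box, chatbot], [box, chatbot])

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)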