Spaces:

sibthinon
/

environment

Running

App Files Files Community

sibthinon committed on May 27

Commit

95cb86f

verified ·

1 Parent(s): 5fdfe78

remove symspell and add rapidfuzz

Browse files

Files changed (1) hide show

app.py +33 -24

app.py CHANGED Viewed

@@ -6,10 +6,13 @@ from sentence_transformers import SentenceTransformer
 from qdrant_client import QdrantClient
 from qdrant_client.models import Filter, FieldCondition, MatchValue
 import os
-from symspellpy.symspellpy import SymSpell, Verbosity
 from pythainlp.tokenize import word_tokenize
 from pyairtable import Table
 from pyairtable import Api
 qdrant_client = QdrantClient(
@@ -49,32 +52,38 @@ model_config = {
 # Global memory to hold feedback state
 latest_query_result = {"query": "", "result": "", "model": "", "raw_query": "", "time": ""}
-symspell = SymSpell(max_dictionary_edit_distance=2)  # (removed in this commit) module-level SymSpell instance, dictionary edit distance capped at 2
-symspell.load_pickle("symspell_fast.pkl")  # (removed in this commit) populated the instance from symspell_fast.pkl at import time
-# Correct misspelled words (removed in this commit)
-def correct_query_with_symspell(query: str) -> str:
-    # Single word → use the plain lookup (works best)
-    if len(query.strip().split()) == 1:
-        suggestions = symspell.lookup(query, Verbosity.CLOSEST, max_edit_distance=2)
-        return suggestions[0].term if suggestions else query
-    # Segment the query into words
-    words = word_tokenize(query.strip(), engine="newmm")
-    corrected = []
-    for word in words:
-        # If the original word is longer than 4 and cannot be corrected → try correcting the whole original word instead. NOTE(review): no length check is actually performed below.
-        suggestions = symspell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
-        if suggestions:
-            corrected.append(suggestions[0].term)
         else:
-            # Try correcting the whole raw word (for words that were segmented wrongly, e.g. "ปิดปอง")
-            alt_suggestions = symspell.lookup_compound(word, 2)
-            if alt_suggestions and alt_suggestions[0].term != word:
-                corrected.append(alt_suggestions[0].term)
-            else:
-                corrected.append(word)
     return " ".join(corrected)
@@ -85,7 +94,7 @@ def search_product(query, model_name):
         return "<p>❌ ไม่พบโมเดล</p>"
     latest_query_result["raw_query"] = query
-    corrected_query = correct_query_with_symspell(query)
     query_embed = model_config[model_name]["func"](corrected_query)
     collection_name = model_config[model_name]["collection"]

 from qdrant_client import QdrantClient
 from qdrant_client.models import Filter, FieldCondition, MatchValue
 import os
+from rapidfuzz import process, fuzz
 from pythainlp.tokenize import word_tokenize
 from pyairtable import Table
 from pyairtable import Api
+import pickle
+import re
+import unicodedata
 qdrant_client = QdrantClient(
 # Global memory to hold feedback state
 latest_query_result = {"query": "", "result": "", "model": "", "raw_query": "", "time": ""}
+with open("keyword_whitelist.pkl", "rb") as f:  # runs at import time; the loaded object is later passed to correct_query_smart as its whitelist
+    keyword_whitelist = pickle.load(f)  # NOTE(review): pickle.load can execute arbitrary code — only safe if this file is trusted; contents/type of the whitelist are not visible here
+def smart_tokenize(query: str) -> list:  # Tokenize with the "newmm" engine, falling back to a whitespace split; NOTE(review): not called anywhere in the visible code
+    tokens = word_tokenize(query, engine="newmm")
+    if len("".join(tokens)) < len(query) * 0.7:  # tokens cover under 70% of the query length → segmentation dropped too much
+        return query.strip().split()
+    return tokens
+def normalize_and_clean_thai(text: str) -> str:  # Return text in Unicode NFC form with two mistyped vowel sequences repaired
+    text = unicodedata.normalize("NFC", text)
+    text = text.replace("เแ", "แ").replace("เเ", "แ")  # both "เแ" and a doubled "เเ" are rewritten to the single vowel "แ"
+    return text
+def correct_query_smart(query: str, whitelist=None, threshold=70) -> str:  # Fuzzy-correct each whitespace-separated token against whitelist; NOTE(review): the None default is used unguarded below, so callers must pass a whitelist
+    query_norm = normalize_and_clean_thai(query)
+    tokens = query_norm.strip().split()  # an empty/blank query yields no tokens, so the function returns ""
+    # Single token → fuzzy-match it directly against the full whitelist entries
+    if len(tokens) == 1:
+        match, score, _ = process.extractOne(tokens[0].lower(), whitelist, scorer=fuzz.token_sort_ratio)  # NOTE(review): extractOne can return None (e.g. empty whitelist), which would break this unpacking — confirm
+        return match if score >= threshold else query_norm  # below threshold: return the normalized (unstripped) query unchanged
+    # Multiple tokens → try to correct them one at a time
+    corrected = []
+    for word in tokens:
+        word_lower = word.lower()
+        if word_lower in whitelist:  # exact (lower-cased) hit: keep the token as typed, original casing included
+            corrected.append(word)
         else:
+            match, score, _ = process.extractOne(word_lower, whitelist, scorer=fuzz.token_sort_ratio)  # NOTE(review): same None-return risk as the single-token branch
+            corrected.append(match if score >= threshold else word)  # below threshold: keep the token as typed
     return " ".join(corrected)
         return "<p>❌ ไม่พบโมเดล</p>"
     latest_query_result["raw_query"] = query
+    corrected_query = correct_query_smart(query,keyword_whitelist)
     query_embed = model_config[model_name]["func"](corrected_query)
     collection_name = model_config[model_name]["collection"]