sibthinon committed on
Commit
95cb86f
·
verified ·
1 Parent(s): 5fdfe78

remove symspell and add rapidfuzz

Browse files
Files changed (1) hide show
  1. app.py +33 -24
app.py CHANGED
@@ -6,10 +6,13 @@ from sentence_transformers import SentenceTransformer
6
  from qdrant_client import QdrantClient
7
  from qdrant_client.models import Filter, FieldCondition, MatchValue
8
  import os
9
- from symspellpy.symspellpy import SymSpell, Verbosity
10
  from pythainlp.tokenize import word_tokenize
11
  from pyairtable import Table
12
  from pyairtable import Api
 
 
 
13
 
14
 
15
  qdrant_client = QdrantClient(
@@ -49,32 +52,38 @@ model_config = {
49
  # Global memory to hold feedback state
50
  latest_query_result = {"query": "", "result": "", "model": "", "raw_query": "", "time": ""}
51
 
52
- symspell = SymSpell(max_dictionary_edit_distance=2)
53
- symspell.load_pickle("symspell_fast.pkl")
54
 
55
- # แก้คำผิด
56
- def correct_query_with_symspell(query: str) -> str:
57
- # ถ้าคำเดียว ใช้ lookup ปกติ (ดีที่สุด)
58
- if len(query.strip().split()) == 1:
59
- suggestions = symspell.lookup(query, Verbosity.CLOSEST, max_edit_distance=2)
60
- return suggestions[0].term if suggestions else query
61
 
62
- # ตัดคำ
63
- words = word_tokenize(query.strip(), engine="newmm")
64
- corrected = []
 
 
 
 
 
65
 
66
- for word in words:
67
- # หากความยาวคำเดิม > 4 และแก้ไม่ได้ → ลองแก้ทั้งคำเดิมแทน
68
- suggestions = symspell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
69
- if suggestions:
70
- corrected.append(suggestions[0].term)
 
 
 
 
 
 
71
  else:
72
- # ลองแก้ทั้ง word แบบ raw (กรณี word ถูกตัดผิด เช่น "ปิดปอง")
73
- alt_suggestions = symspell.lookup_compound(word, 2)
74
- if alt_suggestions and alt_suggestions[0].term != word:
75
- corrected.append(alt_suggestions[0].term)
76
- else:
77
- corrected.append(word)
78
 
79
  return " ".join(corrected)
80
 
@@ -85,7 +94,7 @@ def search_product(query, model_name):
85
  return "<p>❌ ไม่พบโมเดล</p>"
86
 
87
  latest_query_result["raw_query"] = query
88
- corrected_query = correct_query_with_symspell(query)
89
 
90
  query_embed = model_config[model_name]["func"](corrected_query)
91
  collection_name = model_config[model_name]["collection"]
 
6
  from qdrant_client import QdrantClient
7
  from qdrant_client.models import Filter, FieldCondition, MatchValue
8
  import os
9
+ from rapidfuzz import process, fuzz
10
  from pythainlp.tokenize import word_tokenize
11
  from pyairtable import Table
12
  from pyairtable import Api
13
+ import pickle
14
+ import re
15
+ import unicodedata
16
 
17
 
18
  qdrant_client = QdrantClient(
 
52
  # Global memory to hold feedback state
53
  latest_query_result = {"query": "", "result": "", "model": "", "raw_query": "", "time": ""}
54
 
55
# Load the pickled keyword whitelist once at import time; it is the set of
# candidate terms that query correction fuzzy-matches against.
# NOTE(review): pickle.load executes arbitrary code from the file - this
# assumes keyword_whitelist.pkl is a trusted artifact shipped with the app.
with open("keyword_whitelist.pkl", mode="rb") as f:
    keyword_whitelist = pickle.load(f)
57
 
58
def smart_tokenize(query: str) -> list:
    """Split *query* into tokens, preferring the ``newmm`` word tokenizer.

    The tokenizer output is kept only when its tokens together account for
    at least 70% of the characters in *query*; otherwise the query is split
    on whitespace instead.

    Args:
        query: Raw query text.

    Returns:
        A list of token strings.
    """
    pieces = word_tokenize(query, engine="newmm")
    kept_chars = sum(len(piece) for piece in pieces)

    # The tokenizer preserved enough of the input: trust its segmentation.
    if kept_chars >= len(query) * 0.7:
        return pieces

    # Too much of the text was lost during segmentation: fall back to a
    # plain whitespace split of the trimmed query.
    return query.strip().split()
 
63
 
64
def normalize_and_clean_thai(text: str) -> str:
    """Return *text* in Unicode NFC form with doubled leading vowels repaired.

    After NFC normalization, the two-character sequences "เแ" and "เเ" are
    each collapsed into the single vowel "แ", in that order.

    Args:
        text: Input string (may contain Thai and non-Thai characters).

    Returns:
        The normalized, cleaned string.
    """
    cleaned = unicodedata.normalize("NFC", text)
    # Order matters: handle the mixed pair first, then the doubled "เ".
    for typo, fixed in (("เแ", "แ"), ("เเ", "แ")):
        cleaned = cleaned.replace(typo, fixed)
    return cleaned
68
+
69
def _closest_whitelist_term(word: str, whitelist, threshold):
    """Return the whitelist entry closest to *word*, or None if none qualifies.

    The lookup is done on the lower-cased word. None is returned when the
    fuzzy search yields no candidate at all or when the best candidate scores
    below *threshold*.
    """
    best = process.extractOne(word.lower(), whitelist, scorer=fuzz.token_sort_ratio)
    if best is None:
        # extractOne returns None (not a tuple) when there is nothing to match.
        return None
    match, score, _ = best
    return match if score >= threshold else None


def correct_query_smart(query: str, whitelist=None, threshold=70) -> str:
    """Correct likely misspellings in *query* using fuzzy whitelist matching.

    The query is normalized with ``normalize_and_clean_thai`` and split on
    whitespace. Each token that is not already in *whitelist* is replaced by
    its closest whitelist entry when the similarity score reaches
    *threshold*; otherwise the token is kept unchanged.

    Args:
        query: Raw user query.
        whitelist: Collection of accepted keywords. When it is None or empty
            no correction is attempted and the normalized query is returned.
        threshold: Minimum similarity score (0-100) required to accept a
            fuzzy match.

    Returns:
        The corrected query. A single-token query with no acceptable match is
        returned as the normalized input; multi-token queries are re-joined
        with single spaces.
    """
    query_norm = normalize_and_clean_thai(query)
    tokens = query_norm.strip().split()

    # Without candidates there is nothing to correct against; previously this
    # raised TypeError (None whitelist, or unpacking extractOne's None result).
    if whitelist is None or len(whitelist) == 0:
        return query_norm if len(tokens) == 1 else " ".join(tokens)

    # Single token: fuzzy-match it directly against the whole whitelist.
    if len(tokens) == 1:
        match = _closest_whitelist_term(tokens[0], whitelist, threshold)
        return match if match is not None else query_norm

    # Several tokens: try to correct them one at a time.
    corrected = []
    for word in tokens:
        if word.lower() in whitelist:
            # Already a known keyword: keep the user's original spelling.
            corrected.append(word)
            continue
        match = _closest_whitelist_term(word, whitelist, threshold)
        corrected.append(match if match is not None else word)

    return " ".join(corrected)
89
 
 
94
  return "<p>❌ ไม่พบโมเดล</p>"
95
 
96
  latest_query_result["raw_query"] = query
97
+ corrected_query = correct_query_smart(query,keyword_whitelist)
98
 
99
  query_embed = model_config[model_name]["func"](corrected_query)
100
  collection_name = model_config[model_name]["collection"]