sibthinon committed on
Commit
50c341d
·
verified ·
1 Parent(s): dcf6a26
Files changed (1) hide show
  1. app.py +31 -1
app.py CHANGED
@@ -60,6 +60,36 @@ def normalize(text: str) -> str:
60
  text = text.replace("เแ", "แ").replace("เเ", "แ")
61
  return text.strip().lower()
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  # 🌟 Main search function
65
  def search_product(query, model_name):
@@ -68,7 +98,7 @@ def search_product(query, model_name):
68
  return "<p>❌ ไม่พบโมเดล</p>"
69
 
70
  latest_query_result["raw_query"] = query
71
- corrected_query = normalize(query)
72
 
73
  query_embed = model_config[model_name]["func"](corrected_query)
74
  collection_name = model_config[model_name]["collection"]
 
60
  text = text.replace("เแ", "แ").replace("เเ", "แ")
61
  return text.strip().lower()
62
 
63
def smart_tokenize(text: str) -> list:
    """Tokenize *text* via ``word_tokenize`` using the ``newmm`` engine.

    Falls back to a single-element list holding the stripped text when the
    tokenizer returns nothing, or when the joined tokens are shorter than
    half of the stripped input.
    """
    stripped = text.strip()
    pieces = word_tokenize(stripped, engine="newmm")
    if pieces:
        covered = len("".join(pieces))
        # Trust the segmentation only if it kept at least half of the input.
        if covered >= len(stripped) * 0.5:
            return pieces
    return [stripped]
68
+
69
def correct_query_merge_phrases(query: str, whitelist, threshold=80, max_ngram=3):
    """Rewrite *query* by snapping runs of tokens onto whitelist entries.

    The query is normalized and tokenized, then scanned left to right. At
    each position the longest run of up to ``max_ngram`` tokens is tried
    first: the run is concatenated and fuzzy-matched against ``whitelist``
    with ``fuzz.token_sort_ratio``. The first run that scores at least
    ``threshold`` is replaced by the matched entry and the scan jumps past
    it; if no run qualifies, the single token is kept unchanged.

    Args:
        query: Raw query text.
        whitelist: Accepted keywords, passed to ``process.extractOne`` as
            the choices and used for the final membership check.
        threshold: Minimum score required to accept a fuzzy match.
        max_ngram: Largest number of consecutive tokens merged into one
            candidate phrase.

    Returns:
        The corrected words joined by single spaces. One-character words
        that are not in ``whitelist`` are dropped.
    """
    tokens = smart_tokenize(normalize(query))
    corrected = []
    i = 0

    while i < len(tokens):
        matched = False
        # Try the longest candidate phrase first, then shrink.
        for n in range(min(max_ngram, len(tokens) - i), 0, -1):
            phrase = "".join(tokens[i:i + n])
            best = process.extractOne(phrase, whitelist, scorer=fuzz.token_sort_ratio)
            # extractOne returns None when it has no candidate (e.g. an
            # empty whitelist); unpacking that would raise TypeError.
            if best is None:
                continue
            # Index instead of 3-way unpacking so both (match, score) and
            # (match, score, key) result shapes are accepted.
            match, score = best[0], best[1]
            if score >= threshold:
                corrected.append(match)
                i += n
                matched = True
                break
        if not matched:
            corrected.append(tokens[i])
            i += 1

    # Drop words that are a single character long and not in the whitelist.
    cleaned = [word for word in corrected if len(word) > 1 or word in whitelist]
    return " ".join(cleaned)
92
+
93
 
94
  # 🌟 Main search function
95
  def search_product(query, model_name):
 
98
  return "<p>❌ ไม่พบโมเดล</p>"
99
 
100
  latest_query_result["raw_query"] = query
101
+ corrected_query = correct_query_merge_phrases(query,keyword_whitelist)
102
 
103
  query_embed = model_config[model_name]["func"](corrected_query)
104
  collection_name = model_config[model_name]["collection"]