Spaces:
Running
Running
add fuzzy
Browse files
app.py
CHANGED
@@ -60,6 +60,36 @@ def normalize(text: str) -> str:
|
|
60 |
text = text.replace("เแ", "แ").replace("เเ", "แ")
|
61 |
return text.strip().lower()
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
# 🌟 Main search function
|
65 |
def search_product(query, model_name):
|
@@ -68,7 +98,7 @@ def search_product(query, model_name):
|
|
68 |
return "<p>❌ ไม่พบโมเดล</p>"
|
69 |
|
70 |
latest_query_result["raw_query"] = query
|
71 |
-
corrected_query =
|
72 |
|
73 |
query_embed = model_config[model_name]["func"](corrected_query)
|
74 |
collection_name = model_config[model_name]["collection"]
|
|
|
60 |
text = text.replace("เแ", "แ").replace("เเ", "แ")
|
61 |
return text.strip().lower()
|
62 |
|
63 |
+
def smart_tokenize(text: str) -> list:
    """Split *text* into word tokens, falling back to the whole string.

    The stripped text is tokenized with ``word_tokenize`` using the
    ``"newmm"`` engine (presumably PyThaiNLP's tokenizer -- confirm against
    the file's imports).  If the tokenizer returns nothing, or the tokens
    together cover less than half of the stripped text's length, the
    stripped text itself is returned as a single token.

    Args:
        text: Raw or normalized query text.

    Returns:
        The list of tokens, or a one-element list holding the stripped text.
    """
    stripped = text.strip()
    pieces = word_tokenize(stripped, engine="newmm")

    # Total number of characters the tokenizer actually kept.
    covered = len("".join(pieces))

    # Trust the tokenizer only when it produced something that still
    # accounts for at least half of the input.
    if pieces and covered >= len(stripped) * 0.5:
        return pieces
    return [stripped]
|
68 |
+
|
69 |
+
def correct_query_merge_phrases(query: str, whitelist, threshold=80, max_ngram=3):
    """Fuzzy-correct *query* against *whitelist*, merging adjacent tokens.

    The query is passed through ``normalize`` and ``smart_tokenize``.  The
    tokens are then scanned left to right; at each position the longest
    run of up to ``max_ngram`` tokens is concatenated (without separator)
    and fuzzy-matched against the whitelist.  The first run whose best
    match scores at least ``threshold`` is replaced by that whitelist
    entry and the scan jumps past it; otherwise the single token is kept
    unchanged.

    Args:
        query: Raw user query.
        whitelist: Collection of accepted keywords/phrases to match
            against (must support ``in`` membership tests).
        threshold: Minimum score for a whitelist entry to replace the
            phrase.
        max_ngram: Maximum number of consecutive tokens merged into one
            candidate phrase.

    Returns:
        The corrected terms joined by single spaces.  One-character terms
        are dropped unless they appear in the whitelist.
    """
    query_norm = normalize(query)
    tokens = smart_tokenize(query_norm)
    corrected = []
    i = 0

    while i < len(tokens):
        matched = False
        # Longest candidates first, so a multi-token phrase wins over its parts.
        for n in range(min(max_ngram, len(tokens) - i), 0, -1):
            phrase = "".join(tokens[i:i + n])
            result = process.extractOne(
                phrase, whitelist, scorer=fuzz.token_sort_ratio
            )
            # extractOne yields None when there is nothing to match against
            # (e.g. an empty whitelist); unpacking it directly would raise
            # TypeError, so treat it as "no match" instead.
            if result is None:
                continue
            match, score, _ = result
            if score >= threshold:
                corrected.append(match)
                i += n
                matched = True
                break
        if not matched:
            # Nothing in the whitelist is close enough: keep the token as is.
            corrected.append(tokens[i])
            i += 1

    # Drop single-character terms that are not in the whitelist.
    cleaned = [word for word in corrected if len(word) > 1 or word in whitelist]
    return " ".join(cleaned)
|
92 |
+
|
93 |
|
94 |
# 🌟 Main search function
|
95 |
def search_product(query, model_name):
|
|
|
98 |
return "<p>❌ ไม่พบโมเดล</p>"
|
99 |
|
100 |
latest_query_result["raw_query"] = query
|
101 |
+
corrected_query = correct_query_merge_phrases(query,keyword_whitelist)
|
102 |
|
103 |
query_embed = model_config[model_name]["func"](corrected_query)
|
104 |
collection_name = model_config[model_name]["collection"]
|