Spaces:
Running
Running
update fuzzy method
Browse files
app.py
CHANGED
@@ -37,7 +37,12 @@ with open("keyword_whitelist.pkl", "rb") as f:
|
|
37 |
keyword_whitelist = pickle.load(f)
|
38 |
|
39 |
# Utils
|
|
|
|
|
|
|
40 |
def normalize(text: str) -> str:
    """Return *text* in a canonical form for keyword comparison.

    The text is NFC-normalized, the two-character sequences "เแ" and "เเ"
    are each collapsed to the single vowel "แ", surrounding whitespace is
    stripped, and the result is lowercased.
    """
    text = unicodedata.normalize("NFC", text)
    # Order matters: "เแ" is collapsed first, then any remaining "เเ".
    text = text.replace("เแ", "แ").replace("เเ", "แ")
    return text.strip().lower()
|
43 |
|
@@ -54,7 +59,17 @@ def correct_query_merge_phrases(query: str, whitelist, threshold=80, max_ngram=3
|
|
54 |
matched = False
|
55 |
for n in range(min(max_ngram, len(tokens) - i), 0, -1):
|
56 |
phrase = "".join(tokens[i:i+n])
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
if score >= threshold:
|
59 |
corrected.append(match)
|
60 |
i += n
|
|
|
37 |
keyword_whitelist = pickle.load(f)
|
38 |
|
39 |
# Utils
|
40 |
+
def is_non_thai(text: str) -> bool:
    """Return True when *text* is non-empty and purely ASCII "keyword" text.

    Accepted characters are ASCII letters, ASCII digits, ``&``, ``-`` and
    whitespace. Any other character (Thai or otherwise) and the empty
    string both yield False.
    """
    # fullmatch replaces the explicit ^...$ anchors; the accepted set is the same.
    return re.fullmatch(r'[A-Za-z0-9&\-\s]+', text) is not None
|
42 |
+
|
43 |
def normalize(text: str) -> str:
    """Return *text* in a canonical form for keyword comparison.

    Text that ``is_non_thai`` accepts is only stripped of surrounding
    whitespace; its letter case is left untouched. All other text is
    NFC-normalized, the two-character sequences "เแ" and "เเ" are each
    collapsed to the single vowel "แ", and the result is stripped and
    lowercased.
    """
    if is_non_thai(text):
        # This path returns before lowercasing, so the original case is kept.
        return text.strip()
    text = unicodedata.normalize("NFC", text)
    # Order matters: "เแ" is collapsed first, then any remaining "เเ".
    text = text.replace("เแ", "แ").replace("เเ", "แ")
    return text.strip().lower()
|
48 |
|
|
|
59 |
matched = False
|
60 |
for n in range(min(max_ngram, len(tokens) - i), 0, -1):
|
61 |
phrase = "".join(tokens[i:i+n])
|
62 |
+
if phrase in whitelist:
|
63 |
+
corrected.append(phrase)
|
64 |
+
i += n
|
65 |
+
matched = True
|
66 |
+
break
|
67 |
+
match, score, _ = process.extractOne(
|
68 |
+
phrase,
|
69 |
+
whitelist,
|
70 |
+
scorer=fuzz.token_sort_ratio,
|
71 |
+
processor=lambda x: x.lower()
|
72 |
+
)
|
73 |
if score >= threshold:
|
74 |
corrected.append(match)
|
75 |
i += n
|